CINXE.COM

Search | arXiv e-print repository

<!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"/> <meta name="viewport" content="width=device-width, initial-scale=1"/> <!-- new favicon config and versions by realfavicongenerator.net --> <link rel="apple-touch-icon" sizes="180x180" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-16x16.png"> <link rel="manifest" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/site.webmanifest"> <link rel="mask-icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/safari-pinned-tab.svg" color="#b31b1b"> <link rel="shortcut icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon.ico"> <meta name="msapplication-TileColor" content="#b31b1b"> <meta name="msapplication-config" content="images/icons/browserconfig.xml"> <meta name="theme-color" content="#b31b1b"> <!-- end favicon config --> <title>Search | arXiv e-print repository</title> <script defer src="https://static.arxiv.org/static/base/1.0.0a5/fontawesome-free-5.11.2-web/js/all.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/base/1.0.0a5/css/arxivstyle.css" /> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ messageStyle: "none", extensions: ["tex2jax.js"], jax: ["input/TeX", "output/HTML-CSS"], tex2jax: { inlineMath: [ ['$','$'], ["\\(","\\)"] ], displayMath: [ ['$$','$$'], ["\\[","\\]"] ], processEscapes: true, ignoreClass: '.*', processClass: 'mathjax.*' }, TeX: { extensions: ["AMSmath.js", "AMSsymbols.js", "noErrors.js"], noErrors: { inlineDelimiters: ["$","$"], multiLine: false, style: { "font-size": "normal", "border": "" } } }, "HTML-CSS": { availableFonts: ["TeX"] } }); </script> <script src='//static.arxiv.org/MathJax-2.7.3/MathJax.js'></script> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/notification.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/bulma-tooltip.min.css" /> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/search.css" /> <script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g=" crossorigin="anonymous"></script> <script src="https://static.arxiv.org/static/search/0.5.6/js/fieldset.js"></script> <style> radio#cf-customfield_11400 { display: none; } </style> </head> <body> <header><a href="#main-container" class="is-sr-only">Skip to main content</a> <!-- contains Cornell logo and sponsor statement --> <div class="attribution level is-marginless" role="banner"> <div class="level-left"> <a class="level-item" href="https://cornell.edu/"><img src="https://static.arxiv.org/static/base/1.0.0a5/images/cornell-reduced-white-SMALL.svg" alt="Cornell University" width="200" aria-label="logo" /></a> </div> <div class="level-right is-marginless"><p class="sponsors level-item is-marginless"><span id="support-ack-url">We gratefully acknowledge support from<br /> the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors. <a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1&ndash;50 of 108 results for author: <span class="mathjax">Pan, J</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> <div class="content"> <form method="GET" action="/search/eess" aria-role="search"> Searching in archive <strong>eess</strong>. <a href="/search/?searchtype=author&amp;query=Pan%2C+J">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." type="text" value="Pan, J"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Pan%2C+J&amp;terms-0-field=author&amp;size=50&amp;order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Pan, J"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. </div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Pan%2C+J&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Pan%2C+J&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Pan%2C+J&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Pan%2C+J&amp;start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.08904">arXiv:2411.08904</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.08904">pdf</a>, <a href="https://arxiv.org/format/2411.08904">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Generalized Scattering Matrix of Antenna: Moment Solution, Compression Storage and Application </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Shi%2C+C">Chenbo Shi</a>, <a href="/search/eess?searchtype=author&amp;query=Pan%2C+J">Jin Pan</a>, <a href="/search/eess?searchtype=author&amp;query=Gu%2C+X">Xin Gu</a>, <a href="/search/eess?searchtype=author&amp;query=Liang%2C+S">Shichen Liang</a>, <a href="/search/eess?searchtype=author&amp;query=Zuo%2C+L">Le Zuo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.08904v1-abstract-short" style="display: inline;"> This paper introduces a novel approach for computing the generalized scattering matrix (GSM) of antennas using the method of moments (MoM), circumventing many of the constraints associated with finite element methods (FEM). We also propose an effective storage scheme for the GSM, significantly reducing the storage burden for problems involving multiple frequency sampling points. Numerical results&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.08904v1-abstract-full').style.display = 'inline'; document.getElementById('2411.08904v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.08904v1-abstract-full" style="display: none;"> This paper introduces a novel approach for computing the generalized scattering matrix (GSM) of antennas using the method of moments (MoM), circumventing many of the constraints associated with finite element methods (FEM). We also propose an effective storage scheme for the GSM, significantly reducing the storage burden for problems involving multiple frequency sampling points. Numerical results validate the correctness of our formulations. Moreover, we provide a practical example of using the GSM to analyze the coupling effects between elements of an antenna array. This approach allows for the independent consideration of the array and its individual elements, showcasing the substantial advantages of GSM. These examples underscore the potential of GSM to enhance antenna design and analysis, particularly in complex configurations. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.08904v1-abstract-full').style.display = 'none'; document.getElementById('2411.08904v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.06667">arXiv:2411.06667</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.06667">pdf</a>, <a href="https://arxiv.org/format/2411.06667">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> DCF-DS: Deep Cascade Fusion of Diarization and Separation for Speech Recognition under Realistic Single-Channel Conditions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Niu%2C+S">Shu-Tong Niu</a>, <a href="/search/eess?searchtype=author&amp;query=Du%2C+J">Jun Du</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+R">Ruo-Yu Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Yang%2C+G">Gao-Bin Yang</a>, <a href="/search/eess?searchtype=author&amp;query=Gao%2C+T">Tian Gao</a>, <a href="/search/eess?searchtype=author&amp;query=Pan%2C+J">Jia Pan</a>, <a href="/search/eess?searchtype=author&amp;query=Hu%2C+Y">Yu Hu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.06667v2-abstract-short" style="display: inline;"> We propose a single-channel Deep Cascade Fusion of Diarization and Separation (DCF-DS) framework for back-end speech recognition, combining neural speaker diarization (NSD) and speech separation (SS). First, we sequentially integrate the NSD and SS modules within a joint training framework, enabling the separation module to leverage speaker time boundaries from the diarization module effectively.&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06667v2-abstract-full').style.display = 'inline'; document.getElementById('2411.06667v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.06667v2-abstract-full" style="display: none;"> We propose a single-channel Deep Cascade Fusion of Diarization and Separation (DCF-DS) framework for back-end speech recognition, combining neural speaker diarization (NSD) and speech separation (SS). First, we sequentially integrate the NSD and SS modules within a joint training framework, enabling the separation module to leverage speaker time boundaries from the diarization module effectively. Then, to complement DCF-DS training, we introduce a window-level decoding scheme that allows the DCF-DS framework to handle the sparse data convergence instability (SDCI) problem. We also explore using an NSD system trained on real datasets to provide more accurate speaker boundaries during decoding. Additionally, we incorporate an optional multi-input multi-output speech enhancement module (MIMO-SE) within the DCF-DS framework, which offers further performance gains. Finally, we enhance diarization results by re-clustering DCF-DS outputs, improving ASR accuracy. By incorporating the DCF-DS method, we achieved first place in the realistic single-channel track of the CHiME-8 NOTSOFAR-1 challenge. We also perform the evaluation on the open LibriCSS dataset, achieving a new state-of-the-art single-channel speech recognition performance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06667v2-abstract-full').style.display = 'none'; document.getElementById('2411.06667v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 10 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.21256">arXiv:2410.21256</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.21256">pdf</a>, <a href="https://arxiv.org/format/2410.21256">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Multi-modal AI for comprehensive breast cancer prognostication </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Witowski%2C+J">Jan Witowski</a>, <a href="/search/eess?searchtype=author&amp;query=Zeng%2C+K">Ken Zeng</a>, <a href="/search/eess?searchtype=author&amp;query=Cappadona%2C+J">Joseph Cappadona</a>, <a href="/search/eess?searchtype=author&amp;query=Elayoubi%2C+J">Jailan Elayoubi</a>, <a href="/search/eess?searchtype=author&amp;query=Chiru%2C+E+D">Elena Diana Chiru</a>, <a href="/search/eess?searchtype=author&amp;query=Chan%2C+N">Nancy Chan</a>, <a href="/search/eess?searchtype=author&amp;query=Kang%2C+Y">Young-Joon Kang</a>, <a href="/search/eess?searchtype=author&amp;query=Howard%2C+F">Frederick Howard</a>, <a href="/search/eess?searchtype=author&amp;query=Ostrovnaya%2C+I">Irina Ostrovnaya</a>, <a href="/search/eess?searchtype=author&amp;query=Fernandez-Granda%2C+C">Carlos Fernandez-Granda</a>, <a href="/search/eess?searchtype=author&amp;query=Schnabel%2C+F">Freya Schnabel</a>, <a href="/search/eess?searchtype=author&amp;query=Ozerdem%2C+U">Ugur Ozerdem</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+K">Kangning Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Steinsnyder%2C+Z">Zoe Steinsnyder</a>, <a href="/search/eess?searchtype=author&amp;query=Thakore%2C+N">Nitya Thakore</a>, <a href="/search/eess?searchtype=author&amp;query=Sadic%2C+M">Mohammad Sadic</a>, <a href="/search/eess?searchtype=author&amp;query=Yeung%2C+F">Frank Yeung</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+E">Elisa Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Hill%2C+T">Theodore Hill</a>, <a href="/search/eess?searchtype=author&amp;query=Swett%2C+B">Benjamin Swett</a>, <a href="/search/eess?searchtype=author&amp;query=Rigau%2C+D">Danielle Rigau</a>, <a href="/search/eess?searchtype=author&amp;query=Clayburn%2C+A">Andrew Clayburn</a>, <a href="/search/eess?searchtype=author&amp;query=Speirs%2C+V">Valerie Speirs</a>, <a href="/search/eess?searchtype=author&amp;query=Vetter%2C+M">Marcus Vetter</a>, <a href="/search/eess?searchtype=author&amp;query=Sojak%2C+L">Lina Sojak</a> , et al. (26 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.21256v1-abstract-short" style="display: inline;"> Treatment selection in breast cancer is guided by molecular subtypes and clinical characteristics. Recurrence risk assessment plays a crucial role in personalizing treatment. Current methods, including genomic assays, have limited accuracy and clinical utility, leading to suboptimal decisions for many patients. We developed a test for breast cancer patient stratification based on digital pathology&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21256v1-abstract-full').style.display = 'inline'; document.getElementById('2410.21256v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.21256v1-abstract-full" style="display: none;"> Treatment selection in breast cancer is guided by molecular subtypes and clinical characteristics. Recurrence risk assessment plays a crucial role in personalizing treatment. Current methods, including genomic assays, have limited accuracy and clinical utility, leading to suboptimal decisions for many patients. We developed a test for breast cancer patient stratification based on digital pathology and clinical characteristics using novel AI methods. Specifically, we utilized a vision transformer-based pan-cancer foundation model trained with self-supervised learning to extract features from digitized H&amp;E-stained slides. These features were integrated with clinical data to form a multi-modal AI test predicting cancer recurrence and death. The test was developed and evaluated using data from a total of 8,161 breast cancer patients across 15 cohorts originating from seven countries. Of these, 3,502 patients from five cohorts were used exclusively for evaluation, while the remaining patients were used for training. Our test accurately predicted our primary endpoint, disease-free interval, in the five external cohorts (C-index: 0.71 [0.68-0.75], HR: 3.63 [3.02-4.37, p&lt;0.01]). In a direct comparison (N=858), the AI test was more accurate than Oncotype DX, the standard-of-care 21-gene assay, with a C-index of 0.67 [0.61-0.74] versus 0.61 [0.49-0.73], respectively. Additionally, the AI test added independent information to Oncotype DX in a multivariate analysis (HR: 3.11 [1.91-5.09, p&lt;0.01)]). The test demonstrated robust accuracy across all major breast cancer subtypes, including TNBC (C-index: 0.71 [0.62-0.81], HR: 3.81 [2.35-6.17, p=0.02]), where no diagnostic tools are currently recommended by clinical guidelines. These results suggest that our AI test can improve accuracy, extend applicability to a wider range of patients, and enhance access to treatment selection tools. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21256v1-abstract-full').style.display = 'none'; document.getElementById('2410.21256v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.16803">arXiv:2409.16803</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.16803">pdf</a>, <a href="https://arxiv.org/format/2409.16803">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Incorporating Spatial Cues in Modular Speaker Diarization for Multi-channel Multi-party Meetings </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Wang%2C+R">Ruoyu Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Niu%2C+S">Shutong Niu</a>, <a href="/search/eess?searchtype=author&amp;query=Yang%2C+G">Gaobin Yang</a>, <a href="/search/eess?searchtype=author&amp;query=Du%2C+J">Jun Du</a>, <a href="/search/eess?searchtype=author&amp;query=Qian%2C+S">Shuangqing Qian</a>, <a href="/search/eess?searchtype=author&amp;query=Gao%2C+T">Tian Gao</a>, <a href="/search/eess?searchtype=author&amp;query=Pan%2C+J">Jia Pan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.16803v1-abstract-short" style="display: inline;"> Although fully end-to-end speaker diarization systems have made significant progress in recent years, modular systems often achieve superior results in real-world scenarios due to their greater adaptability and robustness. Historically, modular speaker diarization methods have seldom discussed how to leverage spatial cues from multi-channel speech. This paper proposes a three-stage modular system&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.16803v1-abstract-full').style.display = 'inline'; document.getElementById('2409.16803v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.16803v1-abstract-full" style="display: none;"> Although fully end-to-end speaker diarization systems have made significant progress in recent years, modular systems often achieve superior results in real-world scenarios due to their greater adaptability and robustness. Historically, modular speaker diarization methods have seldom discussed how to leverage spatial cues from multi-channel speech. This paper proposes a three-stage modular system to enhance single-channel neural speaker diarization systems and recognition performance by utilizing spatial cues from multi-channel speech to provide more accurate initialization for each stage of neural speaker diarization (NSD) decoding: (1) Overlap detection and continuous speech separation (CSS) on multi-channel speech are used to obtain cleaner single speaker speech segments for clustering, followed by the first NSD decoding pass. (2) The results from the first pass initialize a complex Angular Central Gaussian Mixture Model (cACGMM) to estimate speaker-wise masks on multi-channel speech, and through Overlap-add and Mask-to-VAD, achieve initialization with lower speaker error (SpkErr), followed by the second NSD decoding pass. (3) The second decoding results are used for guided source separation (GSS), recognizing and filtering short segments containing less one word to obtain cleaner speech segments, followed by re-clustering and the final NSD decoding pass. We presented the progressively explored evaluation results from the CHiME-8 NOTSOFAR-1 (Natural Office Talkers in Settings Of Far-field Audio Recordings) challenge, demonstrating the effectiveness of our system and its contribution to improving recognition performance. Our final system achieved the first place in the challenge. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.16803v1-abstract-full').style.display = 'none'; document.getElementById('2409.16803v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages, Submitted to ICASSP 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.13913">arXiv:2409.13913</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.13913">pdf</a>, <a href="https://arxiv.org/format/2409.13913">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Target word activity detector: An approach to obtain ASR word boundaries without lexicon </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Sivasankaran%2C+S">Sunit Sivasankaran</a>, <a href="/search/eess?searchtype=author&amp;query=Sun%2C+E">Eric Sun</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+J">Jinyu Li</a>, <a href="/search/eess?searchtype=author&amp;query=Huang%2C+Y">Yan Huang</a>, <a href="/search/eess?searchtype=author&amp;query=Pan%2C+J">Jing Pan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.13913v1-abstract-short" style="display: inline;"> Obtaining word timestamp information from end-to-end (E2E) ASR models remains challenging due to the lack of explicit time alignment during training. This issue is further complicated in multilingual models. Existing methods, either rely on lexicons or introduce additional tokens, leading to scalability issues and increased computational costs. In this work, we propose a new approach to estimate w&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.13913v1-abstract-full').style.display = 'inline'; document.getElementById('2409.13913v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.13913v1-abstract-full" style="display: none;"> Obtaining word timestamp information from end-to-end (E2E) ASR models remains challenging due to the lack of explicit time alignment during training. This issue is further complicated in multilingual models. Existing methods, either rely on lexicons or introduce additional tokens, leading to scalability issues and increased computational costs. In this work, we propose a new approach to estimate word boundaries without relying on lexicons. Our method leverages word embeddings from sub-word token units and a pretrained ASR model, requiring only word alignment information during training. Our proposed method can scale-up to any number of languages without incurring any additional cost. We validate our approach using a multilingual ASR model trained on five languages and demonstrate its effectiveness against a strong baseline. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.13913v1-abstract-full').style.display = 'none'; document.getElementById('2409.13913v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Submitted to ICASSP 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.02041">arXiv:2409.02041</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.02041">pdf</a>, <a href="https://arxiv.org/format/2409.02041">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> The USTC-NERCSLIP Systems for the CHiME-8 NOTSOFAR-1 Challenge </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Niu%2C+S">Shutong Niu</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+R">Ruoyu Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Du%2C+J">Jun Du</a>, <a href="/search/eess?searchtype=author&amp;query=Yang%2C+G">Gaobin Yang</a>, <a href="/search/eess?searchtype=author&amp;query=Tu%2C+Y">Yanhui Tu</a>, <a href="/search/eess?searchtype=author&amp;query=Wu%2C+S">Siyuan Wu</a>, <a href="/search/eess?searchtype=author&amp;query=Qian%2C+S">Shuangqing Qian</a>, <a href="/search/eess?searchtype=author&amp;query=Wu%2C+H">Huaxin Wu</a>, <a href="/search/eess?searchtype=author&amp;query=Xu%2C+H">Haitao Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+X">Xueyang Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Zhong%2C+G">Guolong Zhong</a>, <a href="/search/eess?searchtype=author&amp;query=Yu%2C+X">Xindi Yu</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+J">Jieru Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+M">Mengzhi Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Cai%2C+D">Di Cai</a>, <a href="/search/eess?searchtype=author&amp;query=Gao%2C+T">Tian Gao</a>, <a href="/search/eess?searchtype=author&amp;query=Wan%2C+G">Genshun Wan</a>, <a href="/search/eess?searchtype=author&amp;query=Ma%2C+F">Feng Ma</a>, <a href="/search/eess?searchtype=author&amp;query=Pan%2C+J">Jia Pan</a>, <a href="/search/eess?searchtype=author&amp;query=Gao%2C+J">Jianqing Gao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.02041v2-abstract-short" style="display: inline;"> This technical report outlines our submission system for the CHiME-8 NOTSOFAR-1 Challenge. The primary difficulty of this challenge is the dataset recorded across various conference rooms, which captures real-world complexities such as high overlap rates, background noises, a variable number of speakers, and natural conversation styles. To address these issues, we optimized the system in several a&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.02041v2-abstract-full').style.display = 'inline'; document.getElementById('2409.02041v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.02041v2-abstract-full" style="display: none;"> This technical report outlines our submission system for the CHiME-8 NOTSOFAR-1 Challenge. The primary difficulty of this challenge is the dataset recorded across various conference rooms, which captures real-world complexities such as high overlap rates, background noises, a variable number of speakers, and natural conversation styles. To address these issues, we optimized the system in several aspects: For front-end speech signal processing, we introduced a data-driven joint training method for diarization and separation (JDS) to enhance audio quality. Additionally, we also integrated traditional guided source separation (GSS) for multi-channel track to provide complementary information for the JDS. For back-end speech recognition, we enhanced Whisper with WavLM, ConvNeXt, and Transformer innovations, applying multi-task training and Noise KLD augmentation, to significantly advance ASR robustness and accuracy. Our system attained a Time-Constrained minimum Permutation Word Error Rate (tcpWER) of 14.265% and 22.989% on the CHiME-8 NOTSOFAR-1 Dev-set-2 multi-channel and single-channel tracks, respectively. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.02041v2-abstract-full').style.display = 'none'; document.getElementById('2409.02041v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 3 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.17175">arXiv:2408.17175</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.17175">pdf</a>, <a href="https://arxiv.org/format/2408.17175">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Codec Does Matter: Exploring the Semantic Shortcoming of Codec for Audio Language Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Ye%2C+Z">Zhen Ye</a>, <a href="/search/eess?searchtype=author&amp;query=Sun%2C+P">Peiwen Sun</a>, <a href="/search/eess?searchtype=author&amp;query=Lei%2C+J">Jiahe Lei</a>, <a href="/search/eess?searchtype=author&amp;query=Lin%2C+H">Hongzhan Lin</a>, <a href="/search/eess?searchtype=author&amp;query=Tan%2C+X">Xu Tan</a>, <a href="/search/eess?searchtype=author&amp;query=Dai%2C+Z">Zheqi Dai</a>, <a href="/search/eess?searchtype=author&amp;query=Kong%2C+Q">Qiuqiang Kong</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+J">Jianyi Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Pan%2C+J">Jiahao Pan</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+Q">Qifeng Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Guo%2C+Y">Yike Guo</a>, <a href="/search/eess?searchtype=author&amp;query=Xue%2C+W">Wei Xue</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.17175v3-abstract-short" style="display: inline;"> Recent advancements in audio generation have been significantly propelled by the capabilities of Large Language Models (LLMs). The existing research on audio LLM has primarily focused on enhancing the architecture and scale of audio language models, as well as leveraging larger datasets, and generally, acoustic codecs, such as EnCodec, are used for audio tokenization. However, these codecs were or&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.17175v3-abstract-full').style.display = 'inline'; document.getElementById('2408.17175v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.17175v3-abstract-full" style="display: none;"> Recent advancements in audio generation have been significantly propelled by the capabilities of Large Language Models (LLMs). The existing research on audio LLM has primarily focused on enhancing the architecture and scale of audio language models, as well as leveraging larger datasets, and generally, acoustic codecs, such as EnCodec, are used for audio tokenization. However, these codecs were originally designed for audio compression, which may lead to suboptimal performance in the context of audio LLM. Our research aims to address the shortcomings of current audio LLM codecs, particularly their challenges in maintaining semantic integrity in generated audio. For instance, existing methods like VALL-E, which condition acoustic token generation on text transcriptions, often suffer from content inaccuracies and elevated word error rates (WER) due to semantic misinterpretations of acoustic tokens, resulting in word skipping and errors. To overcome these issues, we propose a straightforward yet effective approach called X-Codec. X-Codec incorporates semantic features from a pre-trained semantic encoder before the Residual Vector Quantization (RVQ) stage and introduces a semantic reconstruction loss after RVQ. By enhancing the semantic ability of the codec, X-Codec significantly reduces WER in speech synthesis tasks and extends these benefits to non-speech applications, including music and sound generation. Our experiments in text-to-speech, music continuation, and text-to-sound tasks demonstrate that integrating semantic information substantially improves the overall performance of language models in audio generation. Our code and demo are available (Demo: https://x-codec-audio.github.io Code: https://github.com/zhenye234/xcodec) <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.17175v3-abstract-full').style.display = 'none'; document.getElementById('2408.17175v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 30 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.20962">arXiv:2407.20962</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.20962">pdf</a>, <a href="https://arxiv.org/format/2407.20962">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> MMTrail: A Multimodal Trailer Video Dataset with Language and Music Descriptions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Chi%2C+X">Xiaowei Chi</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+Y">Yatian Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Cheng%2C+A">Aosong Cheng</a>, <a href="/search/eess?searchtype=author&amp;query=Fang%2C+P">Pengjun Fang</a>, <a href="/search/eess?searchtype=author&amp;query=Tian%2C+Z">Zeyue Tian</a>, <a href="/search/eess?searchtype=author&amp;query=He%2C+Y">Yingqing He</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+Z">Zhaoyang Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Qi%2C+X">Xingqun Qi</a>, <a href="/search/eess?searchtype=author&amp;query=Pan%2C+J">Jiahao Pan</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+R">Rongyu Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+M">Mengfei Li</a>, <a href="/search/eess?searchtype=author&amp;query=Yuan%2C+R">Ruibin Yuan</a>, <a href="/search/eess?searchtype=author&amp;query=Jiang%2C+Y">Yanbing Jiang</a>, <a href="/search/eess?searchtype=author&amp;query=Xue%2C+W">Wei Xue</a>, <a href="/search/eess?searchtype=author&amp;query=Luo%2C+W">Wenhan Luo</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+Q">Qifeng Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+S">Shanghang Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+Q">Qifeng Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Guo%2C+Y">Yike Guo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.20962v2-abstract-short" style="display: inline;"> Massive multi-modality datasets play a significant role in facilitating the success of large video-language models. However, current video-language datasets primarily provide text descriptions for visual frames, considering audio to be weakly related information. They usually overlook exploring the potential of inherent audio-visual correlation, leading to monotonous annotation within each modalit&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.20962v2-abstract-full').style.display = 'inline'; document.getElementById('2407.20962v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.20962v2-abstract-full" style="display: none;"> Massive multi-modality datasets play a significant role in facilitating the success of large video-language models. However, current video-language datasets primarily provide text descriptions for visual frames, considering audio to be weakly related information. They usually overlook exploring the potential of inherent audio-visual correlation, leading to monotonous annotation within each modality instead of comprehensive and precise descriptions. Such ignorance results in the difficulty of multiple cross-modality studies. To fulfill this gap, we present MMTrail, a large-scale multi-modality video-language dataset incorporating more than 20M trailer clips with visual captions, and 2M high-quality clips with multimodal captions. Trailers preview full-length video works and integrate context, visual frames, and background music. In particular, the trailer has two main advantages: (1) the topics are diverse, and the content characters are of various types, e.g., film, news, and gaming. (2) the corresponding background music is custom-designed, making it more coherent with the visual context. Upon these insights, we propose a systemic captioning framework, achieving various modality annotations with more than 27.1k hours of trailer videos. Here, to ensure the caption retains music perspective while preserving the authority of visual context, we leverage the advanced LLM to merge all annotations adaptively. In this fashion, our MMtrail dataset potentially paves the path for fine-grained large multimodal-language model training. In experiments, we provide evaluation metrics and benchmark results on our dataset, demonstrating the high quality of our annotation and its effectiveness for model training. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.20962v2-abstract-full').style.display = 'none'; document.getElementById('2407.20962v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 30 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">15 Pages. Dataset report</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.20108">arXiv:2407.20108</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.20108">pdf</a>, <a href="https://arxiv.org/format/2407.20108">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Classification, Regression and Segmentation directly from k-Space in Cardiac MRI </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Li%2C+R">Ruochen Li</a>, <a href="/search/eess?searchtype=author&amp;query=Pan%2C+J">Jiazhen Pan</a>, <a href="/search/eess?searchtype=author&amp;query=Zhu%2C+Y">Youxiang Zhu</a>, <a href="/search/eess?searchtype=author&amp;query=Ni%2C+J">Juncheng Ni</a>, <a href="/search/eess?searchtype=author&amp;query=Rueckert%2C+D">Daniel Rueckert</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.20108v1-abstract-short" style="display: inline;"> Cardiac Magnetic Resonance Imaging (CMR) is the gold standard for diagnosing cardiovascular diseases. Clinical diagnoses predominantly rely on magnitude-only Digital Imaging and Communications in Medicine (DICOM) images, omitting crucial phase information that might provide additional diagnostic benefits. In contrast, k-space is complex-valued and encompasses both magnitude and phase information,&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.20108v1-abstract-full').style.display = 'inline'; document.getElementById('2407.20108v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.20108v1-abstract-full" style="display: none;"> Cardiac Magnetic Resonance Imaging (CMR) is the gold standard for diagnosing cardiovascular diseases. Clinical diagnoses predominantly rely on magnitude-only Digital Imaging and Communications in Medicine (DICOM) images, omitting crucial phase information that might provide additional diagnostic benefits. In contrast, k-space is complex-valued and encompasses both magnitude and phase information, while humans cannot directly perceive. In this work, we propose KMAE, a Transformer-based model specifically designed to process k-space data directly, eliminating conventional intermediary conversion steps to the image domain. KMAE can handle critical cardiac disease classification, relevant phenotype regression, and cardiac morphology segmentation tasks. We utilize this model to investigate the potential of k-space-based diagnosis in cardiac MRI. Notably, this model achieves competitive classification and regression performance compared to image-domain methods e.g. Masked Autoencoders (MAEs) and delivers satisfactory segmentation performance with a myocardium dice score of 0.884. Last but not least, our model exhibits robust performance with consistent results even when the k-space is 8* undersampled. We encourage the MR community to explore the untapped potential of k-space and pursue end-to-end, automated diagnosis with reduced human intervention. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.20108v1-abstract-full').style.display = 'none'; document.getElementById('2407.20108v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.19274">arXiv:2407.19274</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.19274">pdf</a>, <a href="https://arxiv.org/format/2407.19274">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Mamba? Catch The Hype Or Rethink What Really Helps for Image Registration </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Jian%2C+B">Bailiang Jian</a>, <a href="/search/eess?searchtype=author&amp;query=Pan%2C+J">Jiazhen Pan</a>, <a href="/search/eess?searchtype=author&amp;query=Ghahremani%2C+M">Morteza Ghahremani</a>, <a href="/search/eess?searchtype=author&amp;query=Rueckert%2C+D">Daniel Rueckert</a>, <a href="/search/eess?searchtype=author&amp;query=Wachinger%2C+C">Christian Wachinger</a>, <a href="/search/eess?searchtype=author&amp;query=Wiestler%2C+B">Benedikt Wiestler</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.19274v1-abstract-short" style="display: inline;"> Our findings indicate that adopting &#34;advanced&#34; computational elements fails to significantly improve registration accuracy. Instead, well-established registration-specific designs offer fair improvements, enhancing results by a marginal 1.5\% over the baseline. Our findings emphasize the importance of rigorous, unbiased evaluation and contribution disentanglement of all low- and high-level registr&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.19274v1-abstract-full').style.display = 'inline'; document.getElementById('2407.19274v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.19274v1-abstract-full" style="display: none;"> Our findings indicate that adopting &#34;advanced&#34; computational elements fails to significantly improve registration accuracy. Instead, well-established registration-specific designs offer fair improvements, enhancing results by a marginal 1.5\% over the baseline. Our findings emphasize the importance of rigorous, unbiased evaluation and contribution disentanglement of all low- and high-level registration components, rather than simply following the computer vision trends with &#34;more advanced&#34; computational blocks. We advocate for simpler yet effective solutions and novel evaluation metrics that go beyond conventional registration accuracy, warranting further research across diverse organs and modalities. The code is available at \url{https://github.com/BailiangJ/rethink-reg}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.19274v1-abstract-full').style.display = 'none'; document.getElementById('2407.19274v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">WBIR 2024 Workshop on Biomedical Imaging Registration</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.16872">arXiv:2406.16872</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.16872">pdf</a>, <a href="https://arxiv.org/format/2406.16872">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Multi-channel Time Series Decomposition Network For Generalizable Sensor-Based Activity Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Pan%2C+J">Jianguo Pan</a>, <a href="/search/eess?searchtype=author&amp;query=Hu%2C+Z">Zhengxin Hu</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+L">Lingdun Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Cai%2C+X">Xia Cai</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.16872v1-abstract-short" style="display: inline;"> Sensor-based human activity recognition is important in daily scenarios such as smart healthcare and homes due to its non-intrusive privacy and low cost advantages, but the problem of out-of-domain generalization caused by differences in focusing individuals and operating environments can lead to significant accuracy degradation on cross-person behavior recognition due to the inconsistent distribu&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.16872v1-abstract-full').style.display = 'inline'; document.getElementById('2406.16872v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.16872v1-abstract-full" style="display: none;"> Sensor-based human activity recognition is important in daily scenarios such as smart healthcare and homes due to its non-intrusive privacy and low cost advantages, but the problem of out-of-domain generalization caused by differences in focusing individuals and operating environments can lead to significant accuracy degradation on cross-person behavior recognition due to the inconsistent distributions of training and test data. To address the above problems, this paper proposes a new method, Multi-channel Time Series Decomposition Network (MTSDNet). Firstly, MTSDNet decomposes the original signal into a combination of multiple polynomials and trigonometric functions by the trainable parameterized temporal decomposition to learn the low-rank representation of the original signal for improving the extraterritorial generalization ability of the model. Then, the different components obtained by the decomposition are classified layer by layer and the layer attention is used to aggregate components to obtain the final classification result. Extensive evaluation on DSADS, OPPORTUNITY, PAMAP2, UCIHAR and UniMib public datasets shows the advantages in predicting accuracy and stability of our method compared with other competing strategies, including the state-of-the-art ones. And the visualization is conducted to reveal MTSDNet&#39;s interpretability and layer-by-layer characteristics. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.16872v1-abstract-full').style.display = 'none'; document.getElementById('2406.16872v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.02430">arXiv:2406.02430</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.02430">pdf</a>, <a href="https://arxiv.org/format/2406.02430">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Seed-TTS: A Family of High-Quality Versatile Speech Generation Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Anastassiou%2C+P">Philip Anastassiou</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+J">Jiawei Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+J">Jitong Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+Y">Yuanzhe Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+Z">Zhuo Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+Z">Ziyi Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Cong%2C+J">Jian Cong</a>, <a href="/search/eess?searchtype=author&amp;query=Deng%2C+L">Lelai Deng</a>, <a href="/search/eess?searchtype=author&amp;query=Ding%2C+C">Chuang Ding</a>, <a href="/search/eess?searchtype=author&amp;query=Gao%2C+L">Lu Gao</a>, <a href="/search/eess?searchtype=author&amp;query=Gong%2C+M">Mingqing Gong</a>, <a href="/search/eess?searchtype=author&amp;query=Huang%2C+P">Peisong Huang</a>, <a href="/search/eess?searchtype=author&amp;query=Huang%2C+Q">Qingqing Huang</a>, <a href="/search/eess?searchtype=author&amp;query=Huang%2C+Z">Zhiying Huang</a>, <a href="/search/eess?searchtype=author&amp;query=Huo%2C+Y">Yuanyuan Huo</a>, <a href="/search/eess?searchtype=author&amp;query=Jia%2C+D">Dongya Jia</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+C">Chumin Li</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+F">Feiya Li</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+H">Hui Li</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+J">Jiaxin Li</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+X">Xiaoyang Li</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+X">Xingxing Li</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+L">Lin Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+S">Shouda Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+S">Sichao Liu</a> , et al. (21 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.02430v1-abstract-short" style="display: inline;"> We introduce Seed-TTS, a family of large-scale autoregressive text-to-speech (TTS) models capable of generating speech that is virtually indistinguishable from human speech. Seed-TTS serves as a foundation model for speech generation and excels in speech in-context learning, achieving performance in speaker similarity and naturalness that matches ground truth human speech in both objective and sub&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.02430v1-abstract-full').style.display = 'inline'; document.getElementById('2406.02430v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.02430v1-abstract-full" style="display: none;"> We introduce Seed-TTS, a family of large-scale autoregressive text-to-speech (TTS) models capable of generating speech that is virtually indistinguishable from human speech. Seed-TTS serves as a foundation model for speech generation and excels in speech in-context learning, achieving performance in speaker similarity and naturalness that matches ground truth human speech in both objective and subjective evaluations. With fine-tuning, we achieve even higher subjective scores across these metrics. Seed-TTS offers superior controllability over various speech attributes such as emotion and is capable of generating highly expressive and diverse speech for speakers in the wild. Furthermore, we propose a self-distillation method for speech factorization, as well as a reinforcement learning approach to enhance model robustness, speaker similarity, and controllability. We additionally present a non-autoregressive (NAR) variant of the Seed-TTS model, named $\text{Seed-TTS}_\text{DiT}$, which utilizes a fully diffusion-based architecture. Unlike previous NAR-based TTS systems, $\text{Seed-TTS}_\text{DiT}$ does not depend on pre-estimated phoneme durations and performs speech generation through end-to-end processing. We demonstrate that this variant achieves comparable performance to the language model-based variant and showcase its effectiveness in speech editing. We encourage readers to listen to demos at \url{https://bytedancespeech.github.io/seedtts_tech_report}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.02430v1-abstract-full').style.display = 'none'; document.getElementById('2406.02430v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.00329">arXiv:2406.00329</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.00329">pdf</a>, <a href="https://arxiv.org/format/2406.00329">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Whole Heart 3D+T Representation Learning Through Sparse 2D Cardiac MR Images </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+Y">Yundi Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+C">Chen Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Shit%2C+S">Suprosanna Shit</a>, <a href="/search/eess?searchtype=author&amp;query=Starck%2C+S">Sophie Starck</a>, <a href="/search/eess?searchtype=author&amp;query=Rueckert%2C+D">Daniel Rueckert</a>, <a href="/search/eess?searchtype=author&amp;query=Pan%2C+J">Jiazhen Pan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.00329v2-abstract-short" style="display: inline;"> Cardiac Magnetic Resonance (CMR) imaging serves as the gold-standard for evaluating cardiac morphology and function. Typically, a multi-view CMR stack, covering short-axis (SA) and 2/3/4-chamber long-axis (LA) views, is acquired for a thorough cardiac assessment. However, efficiently streamlining the complex, high-dimensional 3D+T CMR data and distilling compact, coherent representation remains a&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.00329v2-abstract-full').style.display = 'inline'; document.getElementById('2406.00329v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.00329v2-abstract-full" style="display: none;"> Cardiac Magnetic Resonance (CMR) imaging serves as the gold-standard for evaluating cardiac morphology and function. Typically, a multi-view CMR stack, covering short-axis (SA) and 2/3/4-chamber long-axis (LA) views, is acquired for a thorough cardiac assessment. However, efficiently streamlining the complex, high-dimensional 3D+T CMR data and distilling compact, coherent representation remains a challenge. In this work, we introduce a whole-heart self-supervised learning framework that utilizes masked imaging modeling to automatically uncover the correlations between spatial and temporal patches throughout the cardiac stacks. This process facilitates the generation of meaningful and well-clustered heart representations without relying on the traditionally required, and often costly, labeled data. The learned heart representation can be directly used for various downstream tasks. Furthermore, our method demonstrates remarkable robustness, ensuring consistent representations even when certain CMR planes are missing/flawed. We train our model on 14,000 unlabeled CMR data from UK BioBank and evaluate it on 1,000 annotated data. The proposed method demonstrates superior performance to baselines in tasks that demand comprehensive 3D+T cardiac information, e.g. cardiac phenotype (ejection fraction and ventricle volume) prediction and multi-plane/multi-frame CMR segmentation, highlighting its effectiveness in extracting comprehensive cardiac features that are both anatomically and pathologically relevant. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.00329v2-abstract-full').style.display = 'none'; document.getElementById('2406.00329v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 1 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.00192">arXiv:2406.00192</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.00192">pdf</a>, <a href="https://arxiv.org/format/2406.00192">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Direct Cardiac Segmentation from Undersampled K-space Using Transformers </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+Y">Yundi Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Stolt-Ans%C3%B3%2C+N">Nil Stolt-Ans贸</a>, <a href="/search/eess?searchtype=author&amp;query=Pan%2C+J">Jiazhen Pan</a>, <a href="/search/eess?searchtype=author&amp;query=Huang%2C+W">Wenqi Huang</a>, <a href="/search/eess?searchtype=author&amp;query=Hammernik%2C+K">Kerstin Hammernik</a>, <a href="/search/eess?searchtype=author&amp;query=Rueckert%2C+D">Daniel Rueckert</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.00192v1-abstract-short" style="display: inline;"> The prevailing deep learning-based methods of predicting cardiac segmentation involve reconstructed magnetic resonance (MR) images. The heavy dependency of segmentation approaches on image quality significantly limits the acceleration rate in fast MR reconstruction. Moreover, the practice of treating reconstruction and segmentation as separate sequential processes leads to artifact generation and&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.00192v1-abstract-full').style.display = 'inline'; document.getElementById('2406.00192v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.00192v1-abstract-full" style="display: none;"> The prevailing deep learning-based methods of predicting cardiac segmentation involve reconstructed magnetic resonance (MR) images. The heavy dependency of segmentation approaches on image quality significantly limits the acceleration rate in fast MR reconstruction. Moreover, the practice of treating reconstruction and segmentation as separate sequential processes leads to artifact generation and information loss in the intermediate stage. These issues pose a great risk to achieving high-quality outcomes. To leverage the redundant k-space information overlooked in this dual-step pipeline, we introduce a novel approach to directly deriving segmentations from sparse k-space samples using a transformer (DiSK). DiSK operates by globally extracting latent features from 2D+time k-space data with attention blocks and subsequently predicting the segmentation label of query points. We evaluate our model under various acceleration factors (ranging from 4 to 64) and compare against two image-based segmentation baselines. Our model consistently outperforms the baselines in Dice and Hausdorff distances across foreground classes for all presented sampling rates. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.00192v1-abstract-full').style.display = 'none'; document.getElementById('2406.00192v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.16952">arXiv:2405.16952</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2405.16952">pdf</a>, <a href="https://arxiv.org/format/2405.16952">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> A Variance-Preserving Interpolation Approach for Diffusion Models with Applications to Single Channel Speech Enhancement and Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Guo%2C+Z">Zilu Guo</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+Q">Qing Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Du%2C+J">Jun Du</a>, <a href="/search/eess?searchtype=author&amp;query=Pan%2C+J">Jia Pan</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+Q">Qing-Feng Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Chin-Hui"> Chin-Hui</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.16952v1-abstract-short" style="display: inline;"> In this paper, we propose a variance-preserving interpolation framework to improve diffusion models for single-channel speech enhancement (SE) and automatic speech recognition (ASR). This new variance-preserving interpolation diffusion model (VPIDM) approach requires only 25 iterative steps and obviates the need for a corrector, an essential element in the existing variance-exploding interpolation&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.16952v1-abstract-full').style.display = 'inline'; document.getElementById('2405.16952v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.16952v1-abstract-full" style="display: none;"> In this paper, we propose a variance-preserving interpolation framework to improve diffusion models for single-channel speech enhancement (SE) and automatic speech recognition (ASR). This new variance-preserving interpolation diffusion model (VPIDM) approach requires only 25 iterative steps and obviates the need for a corrector, an essential element in the existing variance-exploding interpolation diffusion model (VEIDM). Two notable distinctions between VPIDM and VEIDM are the scaling function of the mean of state variables and the constraint imposed on the variance relative to the mean&#39;s scale. We conduct a systematic exploration of the theoretical mechanism underlying VPIDM and develop insights regarding VPIDM&#39;s applications in SE and ASR using VPIDM as a frontend. Our proposed approach, evaluated on two distinct data sets, demonstrates VPIDM&#39;s superior performances over conventional discriminative SE algorithms. Furthermore, we assess the performance of the proposed model under varying signal-to-noise ratio (SNR) levels. The investigation reveals VPIDM&#39;s improved robustness in target noise elimination when compared to VEIDM. Furthermore, utilizing the mid-outputs of both VPIDM and VEIDM results in enhanced ASR accuracies, thereby highlighting the practical efficacy of our proposed approach. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.16952v1-abstract-full').style.display = 'none'; document.getElementById('2405.16952v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.18081">arXiv:2404.18081</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2404.18081">pdf</a>, <a href="https://arxiv.org/format/2404.18081">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> ComposerX: Multi-Agent Symbolic Music Composition with LLMs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Deng%2C+Q">Qixin Deng</a>, <a href="/search/eess?searchtype=author&amp;query=Yang%2C+Q">Qikai Yang</a>, <a href="/search/eess?searchtype=author&amp;query=Yuan%2C+R">Ruibin Yuan</a>, <a href="/search/eess?searchtype=author&amp;query=Huang%2C+Y">Yipeng Huang</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+Y">Yi Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+X">Xubo Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Tian%2C+Z">Zeyue Tian</a>, <a href="/search/eess?searchtype=author&amp;query=Pan%2C+J">Jiahao Pan</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+G">Ge Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Lin%2C+H">Hanfeng Lin</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+Y">Yizhi Li</a>, <a href="/search/eess?searchtype=author&amp;query=Ma%2C+Y">Yinghao Ma</a>, <a href="/search/eess?searchtype=author&amp;query=Fu%2C+J">Jie Fu</a>, <a href="/search/eess?searchtype=author&amp;query=Lin%2C+C">Chenghua Lin</a>, <a href="/search/eess?searchtype=author&amp;query=Benetos%2C+E">Emmanouil Benetos</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+W">Wenwu Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Xia%2C+G">Guangyu Xia</a>, <a href="/search/eess?searchtype=author&amp;query=Xue%2C+W">Wei Xue</a>, <a href="/search/eess?searchtype=author&amp;query=Guo%2C+Y">Yike Guo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.18081v2-abstract-short" style="display: inline;"> Music composition represents the creative side of humanity, and itself is a complex task that requires abilities to understand and generate information with long dependency and harmony constraints. While demonstrating impressive capabilities in STEM subjects, current LLMs easily fail in this task, generating ill-written music even when equipped with modern techniques like In-Context-Learning and C&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.18081v2-abstract-full').style.display = 'inline'; document.getElementById('2404.18081v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.18081v2-abstract-full" style="display: none;"> Music composition represents the creative side of humanity, and itself is a complex task that requires abilities to understand and generate information with long dependency and harmony constraints. While demonstrating impressive capabilities in STEM subjects, current LLMs easily fail in this task, generating ill-written music even when equipped with modern techniques like In-Context-Learning and Chain-of-Thoughts. To further explore and enhance LLMs&#39; potential in music composition by leveraging their reasoning ability and the large knowledge base in music history and theory, we propose ComposerX, an agent-based symbolic music generation framework. We find that applying a multi-agent approach significantly improves the music composition quality of GPT-4. The results demonstrate that ComposerX is capable of producing coherent polyphonic music compositions with captivating melodies, while adhering to user instructions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.18081v2-abstract-full').style.display = 'none'; document.getElementById('2404.18081v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 28 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.17621">arXiv:2404.17621</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2404.17621">pdf</a>, <a href="https://arxiv.org/format/2404.17621">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/TMI.2024.3385024">10.1109/TMI.2024.3385024 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Attention-aware non-rigid image registration for accelerated MR imaging </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Ghoul%2C+A">Aya Ghoul</a>, <a href="/search/eess?searchtype=author&amp;query=Pan%2C+J">Jiazhen Pan</a>, <a href="/search/eess?searchtype=author&amp;query=Lingg%2C+A">Andreas Lingg</a>, <a href="/search/eess?searchtype=author&amp;query=K%C3%BCbler%2C+J">Jens K眉bler</a>, <a href="/search/eess?searchtype=author&amp;query=Krumm%2C+P">Patrick Krumm</a>, <a href="/search/eess?searchtype=author&amp;query=Hammernik%2C+K">Kerstin Hammernik</a>, <a href="/search/eess?searchtype=author&amp;query=Rueckert%2C+D">Daniel Rueckert</a>, <a href="/search/eess?searchtype=author&amp;query=Gatidis%2C+S">Sergios Gatidis</a>, <a href="/search/eess?searchtype=author&amp;query=K%C3%BCstner%2C+T">Thomas K眉stner</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.17621v1-abstract-short" style="display: inline;"> Accurate motion estimation at high acceleration factors enables rapid motion-compensated reconstruction in Magnetic Resonance Imaging (MRI) without compromising the diagnostic image quality. In this work, we introduce an attention-aware deep learning-based framework that can perform non-rigid pairwise registration for fully sampled and accelerated MRI. We extract local visual representations to bu&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.17621v1-abstract-full').style.display = 'inline'; document.getElementById('2404.17621v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.17621v1-abstract-full" style="display: none;"> Accurate motion estimation at high acceleration factors enables rapid motion-compensated reconstruction in Magnetic Resonance Imaging (MRI) without compromising the diagnostic image quality. In this work, we introduce an attention-aware deep learning-based framework that can perform non-rigid pairwise registration for fully sampled and accelerated MRI. We extract local visual representations to build similarity maps between the registered image pairs at multiple resolution levels and additionally leverage long-range contextual information using a transformer-based module to alleviate ambiguities in the presence of artifacts caused by undersampling. We combine local and global dependencies to perform simultaneous coarse and fine motion estimation. The proposed method was evaluated on in-house acquired fully sampled and accelerated data of 101 patients and 62 healthy subjects undergoing cardiac and thoracic MRI. The impact of motion estimation accuracy on the downstream task of motion-compensated reconstruction was analyzed. We demonstrate that our model derives reliable and consistent motion fields across different sampling trajectories (Cartesian and radial) and acceleration factors of up to 16x for cardiac motion and 30x for respiratory motion and achieves superior image quality in motion-compensated reconstruction qualitatively and quantitatively compared to conventional and recent deep learning-based approaches. The code is publicly available at https://github.com/lab-midas/GMARAFT. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.17621v1-abstract-full').style.display = 'none'; document.getElementById('2404.17621v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">14 pages, 7 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.16484">arXiv:2404.16484</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2404.16484">pdf</a>, <a href="https://arxiv.org/format/2404.16484">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Real-Time 4K Super-Resolution of Compressed AVIF Images. AIS 2024 Challenge Survey </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Conde%2C+M+V">Marcos V. Conde</a>, <a href="/search/eess?searchtype=author&amp;query=Lei%2C+Z">Zhijun Lei</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+W">Wen Li</a>, <a href="/search/eess?searchtype=author&amp;query=Stejerean%2C+C">Cosmin Stejerean</a>, <a href="/search/eess?searchtype=author&amp;query=Katsavounidis%2C+I">Ioannis Katsavounidis</a>, <a href="/search/eess?searchtype=author&amp;query=Timofte%2C+R">Radu Timofte</a>, <a href="/search/eess?searchtype=author&amp;query=Yoon%2C+K">Kihwan Yoon</a>, <a href="/search/eess?searchtype=author&amp;query=Gankhuyag%2C+G">Ganzorig Gankhuyag</a>, <a href="/search/eess?searchtype=author&amp;query=Lv%2C+J">Jiangtao Lv</a>, <a href="/search/eess?searchtype=author&amp;query=Sun%2C+L">Long Sun</a>, <a href="/search/eess?searchtype=author&amp;query=Pan%2C+J">Jinshan Pan</a>, <a href="/search/eess?searchtype=author&amp;query=Dong%2C+J">Jiangxin Dong</a>, <a href="/search/eess?searchtype=author&amp;query=Tang%2C+J">Jinhui Tang</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+Z">Zhiyuan Li</a>, <a href="/search/eess?searchtype=author&amp;query=Wei%2C+H">Hao Wei</a>, <a href="/search/eess?searchtype=author&amp;query=Ge%2C+C">Chenyang Ge</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+D">Dongyang Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+T">Tianle Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+H">Huaian Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Jin%2C+Y">Yi Jin</a>, <a href="/search/eess?searchtype=author&amp;query=Zhou%2C+M">Menghan Zhou</a>, <a href="/search/eess?searchtype=author&amp;query=Yan%2C+Y">Yiqiang Yan</a>, <a href="/search/eess?searchtype=author&amp;query=Gao%2C+S">Si Gao</a>, <a href="/search/eess?searchtype=author&amp;query=Wu%2C+B">Biao Wu</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+S">Shaoli Liu</a> , et al. (50 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.16484v1-abstract-short" style="display: inline;"> This paper introduces a novel benchmark as part of the AIS 2024 Real-Time Image Super-Resolution (RTSR) Challenge, which aims to upscale compressed images from 540p to 4K resolution (4x factor) in real-time on commercial GPUs. For this, we use a diverse test set containing a variety of 4K images ranging from digital art to gaming and photography. The images are compressed using the modern AVIF cod&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.16484v1-abstract-full').style.display = 'inline'; document.getElementById('2404.16484v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.16484v1-abstract-full" style="display: none;"> This paper introduces a novel benchmark as part of the AIS 2024 Real-Time Image Super-Resolution (RTSR) Challenge, which aims to upscale compressed images from 540p to 4K resolution (4x factor) in real-time on commercial GPUs. For this, we use a diverse test set containing a variety of 4K images ranging from digital art to gaming and photography. The images are compressed using the modern AVIF codec, instead of JPEG. All the proposed methods improve PSNR fidelity over Lanczos interpolation, and process images under 10ms. Out of the 160 participants, 25 teams submitted their code and models. The solutions present novel designs tailored for memory-efficiency and runtime on edge devices. This survey describes the best solutions for real-time SR of compressed high-resolution images. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.16484v1-abstract-full').style.display = 'none'; document.getElementById('2404.16484v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">CVPR 2024, AI for Streaming (AIS) Workshop</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.14700">arXiv:2404.14700</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2404.14700">pdf</a>, <a href="https://arxiv.org/format/2404.14700">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> FlashSpeech: Efficient Zero-Shot Speech Synthesis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Ye%2C+Z">Zhen Ye</a>, <a href="/search/eess?searchtype=author&amp;query=Ju%2C+Z">Zeqian Ju</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+H">Haohe Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Tan%2C+X">Xu Tan</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+J">Jianyi Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Lu%2C+Y">Yiwen Lu</a>, <a href="/search/eess?searchtype=author&amp;query=Sun%2C+P">Peiwen Sun</a>, <a href="/search/eess?searchtype=author&amp;query=Pan%2C+J">Jiahao Pan</a>, <a href="/search/eess?searchtype=author&amp;query=Bian%2C+W">Weizhen Bian</a>, <a href="/search/eess?searchtype=author&amp;query=He%2C+S">Shulin He</a>, <a href="/search/eess?searchtype=author&amp;query=Xue%2C+W">Wei Xue</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+Q">Qifeng Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Guo%2C+Y">Yike Guo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.14700v4-abstract-short" style="display: inline;"> Recent progress in large-scale zero-shot speech synthesis has been significantly advanced by language models and diffusion models. However, the generation process of both methods is slow and computationally intensive. Efficient speech synthesis using a lower computing budget to achieve quality on par with previous work remains a significant challenge. In this paper, we present FlashSpeech, a large&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.14700v4-abstract-full').style.display = 'inline'; document.getElementById('2404.14700v4-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.14700v4-abstract-full" style="display: none;"> Recent progress in large-scale zero-shot speech synthesis has been significantly advanced by language models and diffusion models. However, the generation process of both methods is slow and computationally intensive. Efficient speech synthesis using a lower computing budget to achieve quality on par with previous work remains a significant challenge. In this paper, we present FlashSpeech, a large-scale zero-shot speech synthesis system with approximately 5\% of the inference time compared with previous work. FlashSpeech is built on the latent consistency model and applies a novel adversarial consistency training approach that can train from scratch without the need for a pre-trained diffusion model as the teacher. Furthermore, a new prosody generator module enhances the diversity of prosody, making the rhythm of the speech sound more natural. The generation processes of FlashSpeech can be achieved efficiently with one or two sampling steps while maintaining high audio quality and high similarity to the audio prompt for zero-shot speech generation. Our experimental results demonstrate the superior performance of FlashSpeech. Notably, FlashSpeech can be about 20 times faster than other zero-shot speech synthesis systems while maintaining comparable performance in terms of voice quality and similarity. Furthermore, FlashSpeech demonstrates its versatility by efficiently performing tasks like voice conversion, speech editing, and diverse speech sampling. Audio samples can be found in https://flashspeech.github.io/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.14700v4-abstract-full').style.display = 'none'; document.getElementById('2404.14700v4-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 22 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Efficient zero-shot speech synthesis</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.10343">arXiv:2404.10343</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2404.10343">pdf</a>, <a href="https://arxiv.org/format/2404.10343">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> The Ninth NTIRE 2024 Efficient Super-Resolution Challenge Report </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Ren%2C+B">Bin Ren</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+Y">Yawei Li</a>, <a href="/search/eess?searchtype=author&amp;query=Mehta%2C+N">Nancy Mehta</a>, <a href="/search/eess?searchtype=author&amp;query=Timofte%2C+R">Radu Timofte</a>, <a href="/search/eess?searchtype=author&amp;query=Yu%2C+H">Hongyuan Yu</a>, <a href="/search/eess?searchtype=author&amp;query=Wan%2C+C">Cheng Wan</a>, <a href="/search/eess?searchtype=author&amp;query=Hong%2C+Y">Yuxin Hong</a>, <a href="/search/eess?searchtype=author&amp;query=Han%2C+B">Bingnan Han</a>, <a href="/search/eess?searchtype=author&amp;query=Wu%2C+Z">Zhuoyuan Wu</a>, <a href="/search/eess?searchtype=author&amp;query=Zou%2C+Y">Yajun Zou</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+Y">Yuqing Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+J">Jizhe Li</a>, <a href="/search/eess?searchtype=author&amp;query=He%2C+K">Keji He</a>, <a href="/search/eess?searchtype=author&amp;query=Fan%2C+C">Chao Fan</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+H">Heng Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+X">Xiaolin Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Yin%2C+X">Xuanwu Yin</a>, <a href="/search/eess?searchtype=author&amp;query=Zuo%2C+K">Kunlong Zuo</a>, <a href="/search/eess?searchtype=author&amp;query=Liao%2C+B">Bohao Liao</a>, <a href="/search/eess?searchtype=author&amp;query=Xia%2C+P">Peizhe Xia</a>, <a href="/search/eess?searchtype=author&amp;query=Peng%2C+L">Long Peng</a>, <a href="/search/eess?searchtype=author&amp;query=Du%2C+Z">Zhibo Du</a>, <a href="/search/eess?searchtype=author&amp;query=Di%2C+X">Xin Di</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+W">Wangkai Li</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+Y">Yang Wang</a> , et al. (109 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.10343v2-abstract-short" style="display: inline;"> This paper provides a comprehensive review of the NTIRE 2024 challenge, focusing on efficient single-image super-resolution (ESR) solutions and their outcomes. The task of this challenge is to super-resolve an input image with a magnification factor of x4 based on pairs of low and corresponding high-resolution images. The primary objective is to develop networks that optimize various aspects such&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.10343v2-abstract-full').style.display = 'inline'; document.getElementById('2404.10343v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.10343v2-abstract-full" style="display: none;"> This paper provides a comprehensive review of the NTIRE 2024 challenge, focusing on efficient single-image super-resolution (ESR) solutions and their outcomes. The task of this challenge is to super-resolve an input image with a magnification factor of x4 based on pairs of low and corresponding high-resolution images. The primary objective is to develop networks that optimize various aspects such as runtime, parameters, and FLOPs, while still maintaining a peak signal-to-noise ratio (PSNR) of approximately 26.90 dB on the DIV2K_LSDIR_valid dataset and 26.99 dB on the DIV2K_LSDIR_test dataset. In addition, this challenge has 4 tracks including the main track (overall performance), sub-track 1 (runtime), sub-track 2 (FLOPs), and sub-track 3 (parameters). In the main track, all three metrics (ie runtime, FLOPs, and parameter count) were considered. The ranking of the main track is calculated based on a weighted sum-up of the scores of all other sub-tracks. In sub-track 1, the practical runtime performance of the submissions was evaluated, and the corresponding score was used to determine the ranking. In sub-track 2, the number of FLOPs was considered. The score calculated based on the corresponding FLOPs was used to determine the ranking. In sub-track 3, the number of parameters was considered. The score calculated based on the corresponding parameters was used to determine the ranking. RLFN is set as the baseline for efficiency measurement. The challenge had 262 registered participants, and 34 teams made valid submissions. They gauge the state-of-the-art in efficient single-image super-resolution. To facilitate the reproducibility of the challenge and enable other researchers to build upon these findings, the code and the pre-trained model of validated solutions are made publicly available at https://github.com/Amazingren/NTIRE2024_ESR/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.10343v2-abstract-full').style.display = 'none'; document.getElementById('2404.10343v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 16 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">The report paper of NTIRE2024 Efficient Super-resolution, accepted by CVPRW2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.08857">arXiv:2404.08857</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2404.08857">pdf</a>, <a href="https://arxiv.org/format/2404.08857">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Voice Attribute Editing with Text Prompt </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Sheng%2C+Z">Zhengyan Sheng</a>, <a href="/search/eess?searchtype=author&amp;query=Ai%2C+Y">Yang Ai</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+L">Li-Juan Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Pan%2C+J">Jia Pan</a>, <a href="/search/eess?searchtype=author&amp;query=Ling%2C+Z">Zhen-Hua Ling</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.08857v2-abstract-short" style="display: inline;"> Despite recent advancements in speech generation with text prompt providing control over speech style, voice attributes in synthesized speech remain elusive and challenging to control. This paper introduces a novel task: voice attribute editing with text prompt, with the goal of making relative modifications to voice attributes according to the actions described in the text prompt. To solve this t&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.08857v2-abstract-full').style.display = 'inline'; document.getElementById('2404.08857v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.08857v2-abstract-full" style="display: none;"> Despite recent advancements in speech generation with text prompt providing control over speech style, voice attributes in synthesized speech remain elusive and challenging to control. This paper introduces a novel task: voice attribute editing with text prompt, with the goal of making relative modifications to voice attributes according to the actions described in the text prompt. To solve this task, VoxEditor, an end-to-end generative model, is proposed. In VoxEditor, addressing the insufficiency of text prompt, a Residual Memory (ResMem) block is designed, that efficiently maps voice attributes and these descriptors into the shared feature space. Additionally, the ResMem block is enhanced with a voice attribute degree prediction (VADP) block to align voice attributes with corresponding descriptors, addressing the imprecision of text prompt caused by non-quantitative descriptions of voice attributes. We also establish the open-source VCTK-RVA dataset, which leads the way in manual annotations detailing voice characteristic differences among different speakers. Extensive experiments demonstrate the effectiveness and generalizability of our proposed method in terms of both objective and subjective metrics. The dataset and audio samples are available on the website. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.08857v2-abstract-full').style.display = 'none'; document.getElementById('2404.08857v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 12 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.01611">arXiv:2404.01611</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2404.01611">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Audio Simulation for Sound Source Localization in Virtual Evironment </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Di+Yuan%2C+Y">Yi Di Yuan</a>, <a href="/search/eess?searchtype=author&amp;query=Wong%2C+S+L">Swee Liang Wong</a>, <a href="/search/eess?searchtype=author&amp;query=Pan%2C+J">Jonathan Pan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.01611v1-abstract-short" style="display: inline;"> Non-line-of-sight localization in signal-deprived environments is a challenging yet pertinent problem. Acoustic methods in such predominantly indoor scenarios encounter difficulty due to the reverberant nature. In this study, we aim to locate sound sources to specific locations within a virtual environment by leveraging physically grounded sound propagation simulations and machine learning methods&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.01611v1-abstract-full').style.display = 'inline'; document.getElementById('2404.01611v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.01611v1-abstract-full" style="display: none;"> Non-line-of-sight localization in signal-deprived environments is a challenging yet pertinent problem. Acoustic methods in such predominantly indoor scenarios encounter difficulty due to the reverberant nature. In this study, we aim to locate sound sources to specific locations within a virtual environment by leveraging physically grounded sound propagation simulations and machine learning methods. This process attempts to overcome the issue of data insufficiency to localize sound sources to their location of occurrence especially in post-event localization. We achieve 0.786+/- 0.0136 F1-score using an audio transformer spectrogram approach. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.01611v1-abstract-full').style.display = 'none'; document.getElementById('2404.01611v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">2024 IEEE World Forum on Public Safety Technology</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.00656">arXiv:2404.00656</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2404.00656">pdf</a>, <a href="https://arxiv.org/format/2404.00656">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> WavLLM: Towards Robust and Adaptive Speech Large Language Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Hu%2C+S">Shujie Hu</a>, <a href="/search/eess?searchtype=author&amp;query=Zhou%2C+L">Long Zhou</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+S">Shujie Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+S">Sanyuan Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Meng%2C+L">Lingwei Meng</a>, <a href="/search/eess?searchtype=author&amp;query=Hao%2C+H">Hongkun Hao</a>, <a href="/search/eess?searchtype=author&amp;query=Pan%2C+J">Jing Pan</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+X">Xunying Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+J">Jinyu Li</a>, <a href="/search/eess?searchtype=author&amp;query=Sivasankaran%2C+S">Sunit Sivasankaran</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+L">Linquan Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Wei%2C+F">Furu Wei</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.00656v3-abstract-short" style="display: inline;"> The recent advancements in large language models (LLMs) have revolutionized the field of natural language processing, progressively broadening their scope to multimodal perception and generation. However, effectively integrating listening capabilities into LLMs poses significant challenges, particularly with respect to generalizing across varied contexts and executing complex auditory tasks. In th&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.00656v3-abstract-full').style.display = 'inline'; document.getElementById('2404.00656v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.00656v3-abstract-full" style="display: none;"> The recent advancements in large language models (LLMs) have revolutionized the field of natural language processing, progressively broadening their scope to multimodal perception and generation. However, effectively integrating listening capabilities into LLMs poses significant challenges, particularly with respect to generalizing across varied contexts and executing complex auditory tasks. In this work, we introduce WavLLM, a robust and adaptive speech large language model with dual encoders, and a prompt-aware LoRA weight adapter, optimized by a two-stage curriculum learning approach. Leveraging dual encoders, we decouple different types of speech information, utilizing a Whisper encoder to process the semantic content of speech, and a WavLM encoder to capture the unique characteristics of the speaker&#39;s identity. Within the curriculum learning framework, WavLLM first builds its foundational capabilities by optimizing on mixed elementary single tasks, followed by advanced multi-task training on more complex tasks such as combinations of the elementary tasks. To enhance the flexibility and adherence to different tasks and instructions, a prompt-aware LoRA weight adapter is introduced in the second advanced multi-task training stage. We validate the proposed model on universal speech benchmarks including tasks such as ASR, ST, SV, ER, and also apply it to specialized datasets like Gaokao English listening comprehension set for SQA, and speech Chain-of-Thought (CoT) evaluation set. Experiments demonstrate that the proposed model achieves state-of-the-art performance across a range of speech tasks on the same model size, exhibiting robust generalization capabilities in executing complex tasks using CoT approach. Furthermore, our model successfully completes Gaokao tasks without specialized training. The codes, models, audio, and Gaokao evaluation set can be accessed at \url{aka.ms/wavllm}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.00656v3-abstract-full').style.display = 'none'; document.getElementById('2404.00656v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 31 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">accepted by EMNLP2024 findings</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2401.12173">arXiv:2401.12173</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2401.12173">pdf</a>, <a href="https://arxiv.org/format/2401.12173">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Waveform-Domain Complementary Signal Sets for Interrupted Sampling Repeater Jamming Suppression </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Su%2C+H">Hanning Su</a>, <a href="/search/eess?searchtype=author&amp;query=Bao%2C+Q">Qinglong Bao</a>, <a href="/search/eess?searchtype=author&amp;query=Pan%2C+J">Jiameng Pan</a>, <a href="/search/eess?searchtype=author&amp;query=Guo%2C+F">Fucheng Guo</a>, <a href="/search/eess?searchtype=author&amp;query=Hu%2C+W">Weidong Hu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2401.12173v1-abstract-short" style="display: inline;"> The interrupted-sampling repeater jamming (ISRJ) is coherent and has the characteristic of suppression and deception to degrade the radar detection capabilities. The study focuses on anti-ISRJ techniques in the waveform domain, primarily capitalizing on waveform design and and anti-jamming signal processing methods in the waveform domain. By exploring the relationship between waveform-domain adapt&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.12173v1-abstract-full').style.display = 'inline'; document.getElementById('2401.12173v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2401.12173v1-abstract-full" style="display: none;"> The interrupted-sampling repeater jamming (ISRJ) is coherent and has the characteristic of suppression and deception to degrade the radar detection capabilities. The study focuses on anti-ISRJ techniques in the waveform domain, primarily capitalizing on waveform design and and anti-jamming signal processing methods in the waveform domain. By exploring the relationship between waveform-domain adaptive matched filtering (WD-AMF) output and waveform-domain signals, we demonstrate that ISRJ can be effectively suppressed when the transmitted waveform exhibits waveform-domain complementarity. We introduce a phase-coded (PC) waveform set with waveform-domain complementarity and propose a method for generating such waveform sets of arbitrary code lengths. The performance of WD-AMF are further developed due to the designed waveforms, and simulations affirm the superior adaptive anti-jamming capabilities of the designed waveforms compared to traditional ones. Remarkably, this improved performance is achieved without the need for prior knowledge of ISRJ interference parameters at either the transmitter or receiver stages. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.12173v1-abstract-full').style.display = 'none'; document.getElementById('2401.12173v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 January, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2311.15309">arXiv:2311.15309</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2311.15309">pdf</a>, <a href="https://arxiv.org/format/2311.15309">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Deep Refinement-Based Joint Source Channel Coding over Time-Varying Channels </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Pan%2C+J">Junyu Pan</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+H">Hanlei Li</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+G">Guangyi Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Cai%2C+Y">Yunlong Cai</a>, <a href="/search/eess?searchtype=author&amp;query=Yu%2C+G">Guanding Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2311.15309v1-abstract-short" style="display: inline;"> In recent developments, deep learning (DL)-based joint source-channel coding (JSCC) for wireless image transmission has made significant strides in performance enhancement. Nonetheless, the majority of existing DL-based JSCC methods are tailored for scenarios featuring stable channel conditions, notably a fixed signal-to-noise ratio (SNR). This specialization poses a limitation, as their performan&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.15309v1-abstract-full').style.display = 'inline'; document.getElementById('2311.15309v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2311.15309v1-abstract-full" style="display: none;"> In recent developments, deep learning (DL)-based joint source-channel coding (JSCC) for wireless image transmission has made significant strides in performance enhancement. Nonetheless, the majority of existing DL-based JSCC methods are tailored for scenarios featuring stable channel conditions, notably a fixed signal-to-noise ratio (SNR). This specialization poses a limitation, as their performance tends to wane in practical scenarios marked by highly dynamic channels, given that a fixed SNR inadequately represents the dynamic nature of such channels. In response to this challenge, we introduce a novel solution, namely deep refinement-based JSCC (DRJSCC). This innovative method is designed to seamlessly adapt to channels exhibiting temporal variations. By leveraging instantaneous channel state information (CSI), we dynamically optimize the encoding strategy through re-encoding the channel symbols. This dynamic adjustment ensures that the encoding strategy consistently aligns with the varying channel conditions during the transmission process. Specifically, our approach begins with the division of encoded symbols into multiple blocks, which are transmitted progressively to the receiver. In the event of changing channel conditions, we propose a mechanism to re-encode the remaining blocks, allowing them to adapt to the current channel conditions. Experimental results show that the DRJSCC scheme achieves comparable performance to the other mainstream DL-based JSCC models in stable channel conditions, and also exhibits great robustness against time-varying channels. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.15309v1-abstract-full').style.display = 'none'; document.getElementById('2311.15309v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2311.02248">arXiv:2311.02248</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2311.02248">pdf</a>, <a href="https://arxiv.org/format/2311.02248">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> COSMIC: Data Efficient Instruction-tuning For Speech In-Context Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Pan%2C+J">Jing Pan</a>, <a href="/search/eess?searchtype=author&amp;query=Wu%2C+J">Jian Wu</a>, <a href="/search/eess?searchtype=author&amp;query=Gaur%2C+Y">Yashesh Gaur</a>, <a href="/search/eess?searchtype=author&amp;query=Sivasankaran%2C+S">Sunit Sivasankaran</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+Z">Zhuo Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+S">Shujie Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+J">Jinyu Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2311.02248v2-abstract-short" style="display: inline;"> We present a cost-effective method to integrate speech into a large language model (LLM), resulting in a Contextual Speech Model with Instruction-following/in-context-learning Capabilities (COSMIC) multi-modal LLM. Using GPT-3.5, we generate Speech Comprehension Test Question-Answer (SQA) pairs from speech transcriptions for supervised instruction tuning. With under 30 million trainable parameters&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.02248v2-abstract-full').style.display = 'inline'; document.getElementById('2311.02248v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2311.02248v2-abstract-full" style="display: none;"> We present a cost-effective method to integrate speech into a large language model (LLM), resulting in a Contextual Speech Model with Instruction-following/in-context-learning Capabilities (COSMIC) multi-modal LLM. Using GPT-3.5, we generate Speech Comprehension Test Question-Answer (SQA) pairs from speech transcriptions for supervised instruction tuning. With under 30 million trainable parameters and only 450 hours of English speech data, COSMIC demonstrates emerging capabilities in instruction-following and in-context learning. Equipped with such capabilities, COSMIC achieves a maximum 33.18 BLEU score in 0-shot EN-to-X speech to text translation (S2TT) and a significant boost in the 1-shot setting. Additionally, there is an average 25.8\% relative Word Error Rate (WER) reduction for 1-shot cross-domain adaptation. COSMIC exhibits a significant automatic speech recognition (ASR) accuracy gain in contextual biasing tasks due to its instruction-following capability. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.02248v2-abstract-full').style.display = 'none'; document.getElementById('2311.02248v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 3 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2309.16206">arXiv:2309.16206</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2309.16206">pdf</a>, <a href="https://arxiv.org/format/2309.16206">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Alzheimer&#39;s Disease Prediction via Brain Structural-Functional Deep Fusing Network </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Zuo%2C+Q">Qiankun Zuo</a>, <a href="/search/eess?searchtype=author&amp;query=Pan%2C+J">Junren Pan</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+S">Shuqiang Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2309.16206v2-abstract-short" style="display: inline;"> Fusing structural-functional images of the brain has shown great potential to analyze the deterioration of Alzheimer&#39;s disease (AD). However, it is a big challenge to effectively fuse the correlated and complementary information from multimodal neuroimages. In this paper, a novel model termed cross-modal transformer generative adversarial network (CT-GAN) is proposed to effectively fuse the functi&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.16206v2-abstract-full').style.display = 'inline'; document.getElementById('2309.16206v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2309.16206v2-abstract-full" style="display: none;"> Fusing structural-functional images of the brain has shown great potential to analyze the deterioration of Alzheimer&#39;s disease (AD). However, it is a big challenge to effectively fuse the correlated and complementary information from multimodal neuroimages. In this paper, a novel model termed cross-modal transformer generative adversarial network (CT-GAN) is proposed to effectively fuse the functional and structural information contained in functional magnetic resonance imaging (fMRI) and diffusion tensor imaging (DTI). The CT-GAN can learn topological features and generate multimodal connectivity from multimodal imaging data in an efficient end-to-end manner. Moreover, the swapping bi-attention mechanism is designed to gradually align common features and effectively enhance the complementary features between modalities. By analyzing the generated connectivity features, the proposed model can identify AD-related brain connections. Evaluations on the public ADNI dataset show that the proposed CT-GAN can dramatically improve prediction performance and detect AD-related brain regions effectively. The proposed model also provides new insights for detecting AD-related abnormal neural circuits. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.16206v2-abstract-full').style.display = 'none'; document.getElementById('2309.16206v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 28 September, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2309.14087">arXiv:2309.14087</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2309.14087">pdf</a>, <a href="https://arxiv.org/format/2309.14087">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Adaptive Three Layer Hybrid Reconfigurable Intelligent Surface for 6G Wireless Communication: Trade-offs and Performance </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Ratul%2C+R+H">Rashed Hasan Ratul</a>, <a href="/search/eess?searchtype=author&amp;query=Iqbal%2C+M">Muhammad Iqbal</a>, <a href="/search/eess?searchtype=author&amp;query=Ashraf%2C+T">Tabinda Ashraf</a>, <a href="/search/eess?searchtype=author&amp;query=Pan%2C+J">Jen-Yi Pan</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+Y">Yi-Han Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Lien%2C+S">Shao-Yu Lien</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2309.14087v1-abstract-short" style="display: inline;"> A potential candidate technology for the development of future 6G networks has been recognized as Reconfigurable Intelligent Surface (RIS). However, due to the variation in radio link quality, traditional passive RISs only accomplish a minimal signal gain in situations with strong direct links between user equipment (UE) and base station (BS). In order to get over this fundamental restriction of s&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.14087v1-abstract-full').style.display = 'inline'; document.getElementById('2309.14087v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2309.14087v1-abstract-full" style="display: none;"> A potential candidate technology for the development of future 6G networks has been recognized as Reconfigurable Intelligent Surface (RIS). However, due to the variation in radio link quality, traditional passive RISs only accomplish a minimal signal gain in situations with strong direct links between user equipment (UE) and base station (BS). In order to get over this fundamental restriction of smaller gain, the idea of active RISs might be a suitable solution. In contrast to current passive RIS, which simply reflects and directs signals without any additional amplification, active RISs have the ability to enhance reflected signals by the incorporation of amplifiers inside its elements. However, with additional amplifiers, apart from the relatively complex attributes of RIS-assisted arrangements, the additional energy consumption of such technologies is often disregarded. So, there might be a tradeoff between the additional energy consumption for the RIS technologies and the overall gain acquired by deploying this potential advancement. The objective of this work is to provide a primary idea of a three-layer hybrid RIS-assisted configuration that is responsive to both active and passive RIS, as well as an additional dormant or inactive state. The single RIS structure should be capable of adjusting its overall configuration in response to fluctuations in transmit power and radio link quality. Furthermore, our fabricated passive RIS-assisted structure verifies a portion of the proposed idea, with simulations highlighting its advantages over standalone passive or active RIS-assisted technologies. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.14087v1-abstract-full').style.display = 'none'; document.getElementById('2309.14087v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 September, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted for presentation and publication at the 8th IEEE Asia Pacific Conference on Wireless and Mobile (APWiMob) Conference</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2309.10832">arXiv:2309.10832</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2309.10832">pdf</a>, <a href="https://arxiv.org/ps/2309.10832">ps</a>, <a href="https://arxiv.org/format/2309.10832">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Efficient Multi-Channel Speech Enhancement with Spherical Harmonics Injection for Directional Encoding </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Pan%2C+J">Jiahui Pan</a>, <a href="/search/eess?searchtype=author&amp;query=Shen%2C+P">Pengjie Shen</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+H">Hui Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+X">Xueliang Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2309.10832v1-abstract-short" style="display: inline;"> Multi-channel speech enhancement extracts speech using multiple microphones that capture spatial cues. Effectively utilizing directional information is key for multi-channel enhancement. Deep learning shows great potential on multi-channel speech enhancement and often takes short-time Fourier Transform (STFT) as inputs directly. To fully leverage the spatial information, we introduce a method usin&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.10832v1-abstract-full').style.display = 'inline'; document.getElementById('2309.10832v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2309.10832v1-abstract-full" style="display: none;"> Multi-channel speech enhancement extracts speech using multiple microphones that capture spatial cues. Effectively utilizing directional information is key for multi-channel enhancement. Deep learning shows great potential on multi-channel speech enhancement and often takes short-time Fourier Transform (STFT) as inputs directly. To fully leverage the spatial information, we introduce a method using spherical harmonics transform (SHT) coefficients as auxiliary model inputs. These coefficients concisely represent spatial distributions. Specifically, our model has two encoders, one for the STFT and another for the SHT. By fusing both encoders in the decoder to estimate the enhanced STFT, we effectively incorporate spatial context. Evaluations on TIMIT under varying noise and reverberation show our model outperforms established benchmarks. Remarkably, this is achieved with fewer computations and parameters. By leveraging spherical harmonics to incorporate directional cues, our model efficiently improves the performance of the multi-channel speech enhancement. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.10832v1-abstract-full').style.display = 'none'; document.getElementById('2309.10832v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 September, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">arXiv admin note: text overlap with arXiv:2309.10393</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2309.10393">arXiv:2309.10393</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2309.10393">pdf</a>, <a href="https://arxiv.org/ps/2309.10393">ps</a>, <a href="https://arxiv.org/format/2309.10393">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Hierarchical Modeling of Spatial Cues via Spherical Harmonics for Multi-Channel Speech Enhancement </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Pan%2C+J">Jiahui Pan</a>, <a href="/search/eess?searchtype=author&amp;query=He%2C+S">Shulin He</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+H">Hui Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+X">Xueliang Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2309.10393v1-abstract-short" style="display: inline;"> Multi-channel speech enhancement utilizes spatial information from multiple microphones to extract the target speech. However, most existing methods do not explicitly model spatial cues, instead relying on implicit learning from multi-channel spectra. To better leverage spatial information, we propose explicitly incorporating spatial modeling by applying spherical harmonic transforms (SHT) to the&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.10393v1-abstract-full').style.display = 'inline'; document.getElementById('2309.10393v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2309.10393v1-abstract-full" style="display: none;"> Multi-channel speech enhancement utilizes spatial information from multiple microphones to extract the target speech. However, most existing methods do not explicitly model spatial cues, instead relying on implicit learning from multi-channel spectra. To better leverage spatial information, we propose explicitly incorporating spatial modeling by applying spherical harmonic transforms (SHT) to the multi-channel input. In detail, a hierarchical framework is introduced whereby lower order harmonics capturing broader spatial patterns are estimated first, then combined with higher orders to recursively predict finer spatial details. Experiments on TIMIT demonstrate the proposed method can effectively recover target spatial patterns and achieve improved performance over baseline models, using fewer parameters and computations. Explicitly modeling spatial information hierarchically enables more effective multi-channel speech enhancement. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.10393v1-abstract-full').style.display = 'none'; document.getElementById('2309.10393v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 September, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2309.10379">arXiv:2309.10379</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2309.10379">pdf</a>, <a href="https://arxiv.org/ps/2309.10379">ps</a>, <a href="https://arxiv.org/format/2309.10379">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> PDPCRN: Parallel Dual-Path CRN with Bi-directional Inter-Branch Interactions for Multi-Channel Speech Enhancement </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Pan%2C+J">Jiahui Pan</a>, <a href="/search/eess?searchtype=author&amp;query=He%2C+S">Shulin He</a>, <a href="/search/eess?searchtype=author&amp;query=Wu%2C+T">Tianci Wu</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+H">Hui Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+X">Xueliang Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2309.10379v1-abstract-short" style="display: inline;"> Multi-channel speech enhancement seeks to utilize spatial information to distinguish target speech from interfering signals. While deep learning approaches like the dual-path convolutional recurrent network (DPCRN) have made strides, challenges persist in effectively modeling inter-channel correlations and amalgamating multi-level information. In response, we introduce the Parallel Dual-Path Convo&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.10379v1-abstract-full').style.display = 'inline'; document.getElementById('2309.10379v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2309.10379v1-abstract-full" style="display: none;"> Multi-channel speech enhancement seeks to utilize spatial information to distinguish target speech from interfering signals. While deep learning approaches like the dual-path convolutional recurrent network (DPCRN) have made strides, challenges persist in effectively modeling inter-channel correlations and amalgamating multi-level information. In response, we introduce the Parallel Dual-Path Convolutional Recurrent Network (PDPCRN). This acoustic modeling architecture has two key innovations. First, a parallel design with separate branches extracts complementary features. Second, bi-directional modules enable cross-branch communication. Together, these facilitate diverse representation fusion and enhanced modeling. Experimental validation on TIMIT datasets underscores the prowess of PDPCRN. Notably, against baseline models like the standard DPCRN, PDPCRN not only outperforms in PESQ and STOI metrics but also boasts a leaner computational footprint with reduced parameters. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.10379v1-abstract-full').style.display = 'none'; document.getElementById('2309.10379v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 September, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2309.08643">arXiv:2309.08643</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2309.08643">pdf</a>, <a href="https://arxiv.org/format/2309.08643">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> NISF: Neural Implicit Segmentation Functions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Stolt-Ans%C3%B3%2C+N">Nil Stolt-Ans贸</a>, <a href="/search/eess?searchtype=author&amp;query=McGinnis%2C+J">Julian McGinnis</a>, <a href="/search/eess?searchtype=author&amp;query=Pan%2C+J">Jiazhen Pan</a>, <a href="/search/eess?searchtype=author&amp;query=Hammernik%2C+K">Kerstin Hammernik</a>, <a href="/search/eess?searchtype=author&amp;query=Rueckert%2C+D">Daniel Rueckert</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2309.08643v1-abstract-short" style="display: inline;"> Segmentation of anatomical shapes from medical images has taken an important role in the automation of clinical measurements. While typical deep-learning segmentation approaches are performed on discrete voxels, the underlying objects being analysed exist in a real-valued continuous space. Approaches that rely on convolutional neural networks (CNNs) are limited to grid-like inputs and not easily a&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.08643v1-abstract-full').style.display = 'inline'; document.getElementById('2309.08643v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2309.08643v1-abstract-full" style="display: none;"> Segmentation of anatomical shapes from medical images has taken an important role in the automation of clinical measurements. While typical deep-learning segmentation approaches are performed on discrete voxels, the underlying objects being analysed exist in a real-valued continuous space. Approaches that rely on convolutional neural networks (CNNs) are limited to grid-like inputs and not easily applicable to sparse or partial measurements. We propose a novel family of image segmentation models that tackle many of CNNs&#39; shortcomings: Neural Implicit Segmentation Functions (NISF). Our framework takes inspiration from the field of neural implicit functions where a network learns a mapping from a real-valued coordinate-space to a shape representation. NISFs have the ability to segment anatomical shapes in high-dimensional continuous spaces. Training is not limited to voxelized grids, and covers applications with sparse and partial data. Interpolation between observations is learnt naturally in the training procedure and requires no post-processing. Furthermore, NISFs allow the leveraging of learnt shape priors to make predictions for regions outside of the original image plane. We go on to show the framework achieves dice scores of 0.87 $\pm$ 0.045 on a (3D+t) short-axis cardiac segmentation task using the UK Biobank dataset. We also provide a qualitative analysis on our frameworks ability to perform segmentation and image interpolation on unseen regions of an image volume at arbitrary resolutions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.08643v1-abstract-full').style.display = 'none'; document.getElementById('2309.08643v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 September, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2309.08348">arXiv:2309.08348</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2309.08348">pdf</a>, <a href="https://arxiv.org/format/2309.08348">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> The Multimodal Information Based Speech Processing (MISP) 2023 Challenge: Audio-Visual Target Speaker Extraction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Wu%2C+S">Shilong Wu</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+C">Chenxi Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+H">Hang Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Dai%2C+Y">Yusheng Dai</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+C">Chenyue Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+R">Ruoyu Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Lan%2C+H">Hongbo Lan</a>, <a href="/search/eess?searchtype=author&amp;query=Du%2C+J">Jun Du</a>, <a href="/search/eess?searchtype=author&amp;query=Lee%2C+C">Chin-Hui Lee</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+J">Jingdong Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Watanabe%2C+S">Shinji Watanabe</a>, <a href="/search/eess?searchtype=author&amp;query=Siniscalchi%2C+S+M">Sabato Marco Siniscalchi</a>, <a href="/search/eess?searchtype=author&amp;query=Scharenborg%2C+O">Odette Scharenborg</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+Z">Zhong-Qiu Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Pan%2C+J">Jia Pan</a>, <a href="/search/eess?searchtype=author&amp;query=Gao%2C+J">Jianqing Gao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2309.08348v1-abstract-short" style="display: inline;"> Previous Multimodal Information based Speech Processing (MISP) challenges mainly focused on audio-visual speech recognition (AVSR) with commendable success. However, the most advanced back-end recognition systems often hit performance limits due to the complex acoustic environments. This has prompted a shift in focus towards the Audio-Visual Target Speaker Extraction (AVTSE) task for the MISP 2023&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.08348v1-abstract-full').style.display = 'inline'; document.getElementById('2309.08348v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2309.08348v1-abstract-full" style="display: none;"> Previous Multimodal Information based Speech Processing (MISP) challenges mainly focused on audio-visual speech recognition (AVSR) with commendable success. However, the most advanced back-end recognition systems often hit performance limits due to the complex acoustic environments. This has prompted a shift in focus towards the Audio-Visual Target Speaker Extraction (AVTSE) task for the MISP 2023 challenge in ICASSP 2024 Signal Processing Grand Challenges. Unlike existing audio-visual speech enhance-ment challenges primarily focused on simulation data, the MISP 2023 challenge uniquely explores how front-end speech processing, combined with visual clues, impacts back-end tasks in real-world scenarios. This pioneering effort aims to set the first benchmark for the AVTSE task, offering fresh insights into enhancing the ac-curacy of back-end speech recognition systems through AVTSE in challenging and real acoustic environments. This paper delivers a thorough overview of the task setting, dataset, and baseline system of the MISP 2023 challenge. It also includes an in-depth analysis of the challenges participants may encounter. The experimental results highlight the demanding nature of this task, and we look forward to the innovative solutions participants will bring forward. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.08348v1-abstract-full').style.display = 'none'; document.getElementById('2309.08348v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 September, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages, 4 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2309.01994">arXiv:2309.01994</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2309.01994">pdf</a>, <a href="https://arxiv.org/format/2309.01994">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Cloud Control of Connected Vehicle under Bi-directional Time-varying delay: An Application of Predictor-observer Structured Controller </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Pan%2C+J">Ji-An Pan</a>, <a href="/search/eess?searchtype=author&amp;query=Xu%2C+Q">Qing Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+K">Keqiang Li</a>, <a href="/search/eess?searchtype=author&amp;query=Yang%2C+C">Chunying Yang</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+J">Jianqiang Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2309.01994v2-abstract-short" style="display: inline;"> This article is devoted to addressing the cloud control of connected vehicles, specifically focusing on analyzing the effect of bi-directional communication-induced delays. To mitigate the adverse effects of such delays, a novel predictor-observer structured controller is proposed which compensate for both measurable output delays and unmeasurable, yet bounded, input delays simultaneously. The stu&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.01994v2-abstract-full').style.display = 'inline'; document.getElementById('2309.01994v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2309.01994v2-abstract-full" style="display: none;"> This article is devoted to addressing the cloud control of connected vehicles, specifically focusing on analyzing the effect of bi-directional communication-induced delays. To mitigate the adverse effects of such delays, a novel predictor-observer structured controller is proposed which compensate for both measurable output delays and unmeasurable, yet bounded, input delays simultaneously. The study begins by novelly constructing an equivalent delay-free inter-connected system model that incorporates the Predictor-Observer controller, considering certain delay boundaries and model uncertainties. Subsequently, a stability analysis is conducted to assess the system&#39;s robustness under these conditions. Next, the connected vehicle lateral control scenario is built which contain high-fidelity vehicle dynamic model. The results demonstrate the controller&#39;s ability to accurately predict the system states, even under time-varying bi-directional delays. Finally, the proposed method is deployed in a real connected vehicle lateral control system. Comparative tests with a conventional linear feedback controller showcase significantly improved control performance under dominant bi-directional delay conditions, affirming the superiority of the proposed method against the delay. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.01994v2-abstract-full').style.display = 'none'; document.getElementById('2309.01994v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 5 September, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2308.14638">arXiv:2308.14638</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2308.14638">pdf</a>, <a href="https://arxiv.org/format/2308.14638">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> The USTC-NERCSLIP Systems for the CHiME-7 DASR Challenge </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Wang%2C+R">Ruoyu Wang</a>, <a href="/search/eess?searchtype=author&amp;query=He%2C+M">Maokui He</a>, <a href="/search/eess?searchtype=author&amp;query=Du%2C+J">Jun Du</a>, <a href="/search/eess?searchtype=author&amp;query=Zhou%2C+H">Hengshun Zhou</a>, <a href="/search/eess?searchtype=author&amp;query=Niu%2C+S">Shutong Niu</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+H">Hang Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Yue%2C+Y">Yanyan Yue</a>, <a href="/search/eess?searchtype=author&amp;query=Yang%2C+G">Gaobin Yang</a>, <a href="/search/eess?searchtype=author&amp;query=Wu%2C+S">Shilong Wu</a>, <a href="/search/eess?searchtype=author&amp;query=Sun%2C+L">Lei Sun</a>, <a href="/search/eess?searchtype=author&amp;query=Tu%2C+Y">Yanhui Tu</a>, <a href="/search/eess?searchtype=author&amp;query=Tang%2C+H">Haitao Tang</a>, <a href="/search/eess?searchtype=author&amp;query=Qian%2C+S">Shuangqing Qian</a>, <a href="/search/eess?searchtype=author&amp;query=Gao%2C+T">Tian Gao</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+M">Mengzhi Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Wan%2C+G">Genshun Wan</a>, <a href="/search/eess?searchtype=author&amp;query=Pan%2C+J">Jia Pan</a>, <a href="/search/eess?searchtype=author&amp;query=Gao%2C+J">Jianqing Gao</a>, <a href="/search/eess?searchtype=author&amp;query=Lee%2C+C">Chin-Hui Lee</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2308.14638v2-abstract-short" style="display: inline;"> This technical report details our submission system to the CHiME-7 DASR Challenge, which focuses on speaker diarization and speech recognition under complex multi-speaker scenarios. Additionally, it also evaluates the efficiency of systems in handling diverse array devices. To address these issues, we implemented an end-to-end speaker diarization system and introduced a rectification strategy base&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.14638v2-abstract-full').style.display = 'inline'; document.getElementById('2308.14638v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2308.14638v2-abstract-full" style="display: none;"> This technical report details our submission system to the CHiME-7 DASR Challenge, which focuses on speaker diarization and speech recognition under complex multi-speaker scenarios. Additionally, it also evaluates the efficiency of systems in handling diverse array devices. To address these issues, we implemented an end-to-end speaker diarization system and introduced a rectification strategy based on multi-channel spatial information. This approach significantly diminished the word error rates (WER). In terms of recognition, we utilized publicly available pre-trained models as the foundational models to train our end-to-end speech recognition models. Our system attained a Macro-averaged diarization-attributed WER (DA-WER) of 21.01% on the CHiME-7 evaluation set, which signifies a relative improvement of 62.04% over the official baseline system. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.14638v2-abstract-full').style.display = 'none'; document.getElementById('2308.14638v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 28 August, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by 2023 CHiME Workshop, Oral</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2308.10488">arXiv:2308.10488</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2308.10488">pdf</a>, <a href="https://arxiv.org/format/2308.10488">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Enhancing Medical Image Segmentation: Optimizing Cross-Entropy Weights and Post-Processing with Autoencoders </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Singh%2C+P">Pranav Singh</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+L">Luoyao Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+M">Mei Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Pan%2C+J">Jinqian Pan</a>, <a href="/search/eess?searchtype=author&amp;query=Chukkapalli%2C+R">Raviteja Chukkapalli</a>, <a href="/search/eess?searchtype=author&amp;query=Chaudhari%2C+S">Shravan Chaudhari</a>, <a href="/search/eess?searchtype=author&amp;query=Cirrone%2C+J">Jacopo Cirrone</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2308.10488v1-abstract-short" style="display: inline;"> The task of medical image segmentation presents unique challenges, necessitating both localized and holistic semantic understanding to accurately delineate areas of interest, such as critical tissues or aberrant features. This complexity is heightened in medical image segmentation due to the high degree of inter-class similarities, intra-class variations, and possible image obfuscation. The segmen&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.10488v1-abstract-full').style.display = 'inline'; document.getElementById('2308.10488v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2308.10488v1-abstract-full" style="display: none;"> The task of medical image segmentation presents unique challenges, necessitating both localized and holistic semantic understanding to accurately delineate areas of interest, such as critical tissues or aberrant features. This complexity is heightened in medical image segmentation due to the high degree of inter-class similarities, intra-class variations, and possible image obfuscation. The segmentation task further diversifies when considering the study of histopathology slides for autoimmune diseases like dermatomyositis. The analysis of cell inflammation and interaction in these cases has been less studied due to constraints in data acquisition pipelines. Despite the progressive strides in medical science, we lack a comprehensive collection of autoimmune diseases. As autoimmune diseases globally escalate in prevalence and exhibit associations with COVID-19, their study becomes increasingly essential. While there is existing research that integrates artificial intelligence in the analysis of various autoimmune diseases, the exploration of dermatomyositis remains relatively underrepresented. In this paper, we present a deep-learning approach tailored for Medical image segmentation. Our proposed method outperforms the current state-of-the-art techniques by an average of 12.26% for U-Net and 12.04% for U-Net++ across the ResNet family of encoders on the dermatomyositis dataset. Furthermore, we probe the importance of optimizing loss function weights and benchmark our methodology on three challenging medical image segmentation tasks <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.10488v1-abstract-full').style.display = 'none'; document.getElementById('2308.10488v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 August, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at ICCV CVAMD 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2307.12672">arXiv:2307.12672</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2307.12672">pdf</a>, <a href="https://arxiv.org/format/2307.12672">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Global k-Space Interpolation for Dynamic MRI Reconstruction using Masked Image Modeling </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Pan%2C+J">Jiazhen Pan</a>, <a href="/search/eess?searchtype=author&amp;query=Shit%2C+S">Suprosanna Shit</a>, <a href="/search/eess?searchtype=author&amp;query=Turgut%2C+%C3%96">脰zg眉n Turgut</a>, <a href="/search/eess?searchtype=author&amp;query=Huang%2C+W">Wenqi Huang</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+H+B">Hongwei Bran Li</a>, <a href="/search/eess?searchtype=author&amp;query=Stolt-Ans%C3%B3%2C+N">Nil Stolt-Ans贸</a>, <a href="/search/eess?searchtype=author&amp;query=K%C3%BCstner%2C+T">Thomas K眉stner</a>, <a href="/search/eess?searchtype=author&amp;query=Hammernik%2C+K">Kerstin Hammernik</a>, <a href="/search/eess?searchtype=author&amp;query=Rueckert%2C+D">Daniel Rueckert</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2307.12672v2-abstract-short" style="display: inline;"> In dynamic Magnetic Resonance Imaging (MRI), k-space is typically undersampled due to limited scan time, resulting in aliasing artifacts in the image domain. Hence, dynamic MR reconstruction requires not only modeling spatial frequency components in the x and y directions of k-space but also considering temporal redundancy. Most previous works rely on image-domain regularizers (priors) to conduct&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2307.12672v2-abstract-full').style.display = 'inline'; document.getElementById('2307.12672v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2307.12672v2-abstract-full" style="display: none;"> In dynamic Magnetic Resonance Imaging (MRI), k-space is typically undersampled due to limited scan time, resulting in aliasing artifacts in the image domain. Hence, dynamic MR reconstruction requires not only modeling spatial frequency components in the x and y directions of k-space but also considering temporal redundancy. Most previous works rely on image-domain regularizers (priors) to conduct MR reconstruction. In contrast, we focus on interpolating the undersampled k-space before obtaining images with Fourier transform. In this work, we connect masked image modeling with k-space interpolation and propose a novel Transformer-based k-space Global Interpolation Network, termed k-GIN. Our k-GIN learns global dependencies among low- and high-frequency components of 2D+t k-space and uses it to interpolate unsampled data. Further, we propose a novel k-space Iterative Refinement Module (k-IRM) to enhance the high-frequency components learning. We evaluate our approach on 92 in-house 2D+t cardiac MR subjects and compare it to MR reconstruction methods with image-domain regularizers. Experiments show that our proposed k-space interpolation method quantitatively and qualitatively outperforms baseline methods. Importantly, the proposed approach achieves substantially higher robustness and generalizability in cases of highly-undersampled MR data. For video presentation, poster, GIF results and code please check our project page: https://jzpeterpan.github.io/k-gin.github.io/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2307.12672v2-abstract-full').style.display = 'none'; document.getElementById('2307.12672v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 24 July, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2307.12397">arXiv:2307.12397</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2307.12397">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Performance Comparison Between VoLTE and non-VoLTE Voice Calls During Mobility in Commercial Deployment: A Drive Test-Based Analysis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Ratul%2C+R+H">Rashed Hasan Ratul</a>, <a href="/search/eess?searchtype=author&amp;query=Iqbal%2C+M">Muhammad Iqbal</a>, <a href="/search/eess?searchtype=author&amp;query=Pan%2C+J">Jen-Yi Pan</a>, <a href="/search/eess?searchtype=author&amp;query=Deen%2C+M+M+A">Mohammad Mahadi Al Deen</a>, <a href="/search/eess?searchtype=author&amp;query=Kawser%2C+M+T">Mohammad Tawhid Kawser</a>, <a href="/search/eess?searchtype=author&amp;query=Billah%2C+M+M">Mohammad Masum Billah</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2307.12397v1-abstract-short" style="display: inline;"> The optimization of network performance is vital for the delivery of services using standard cellular technologies for mobile communications. Call setup delay and User Equipment (UE) battery savings significantly influence network performance. Improving these factors is vital for ensuring optimal service delivery. In comparison to traditional circuit-switched voice calls, VoLTE (Voice over LTE) te&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2307.12397v1-abstract-full').style.display = 'inline'; document.getElementById('2307.12397v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2307.12397v1-abstract-full" style="display: none;"> The optimization of network performance is vital for the delivery of services using standard cellular technologies for mobile communications. Call setup delay and User Equipment (UE) battery savings significantly influence network performance. Improving these factors is vital for ensuring optimal service delivery. In comparison to traditional circuit-switched voice calls, VoLTE (Voice over LTE) technology offers faster call setup durations and better battery-saving performance. To validate these claims, a drive test was carried out using the XCAL drive test tool to collect real-time network parameter details in VoLTE and non-VoLTE voice calls. The findings highlight the analysis of real-time network characteristics, such as the call setup delay calculation, battery-saving performance, and DRX mechanism. The study contributes to the understanding of network optimization strategies and provides insights for enhancing the quality of service (QoS) in mobile communication networks. Examining VoLTE and non-VoLTE operations, this research highlights the substantial energy savings obtained by VoLTE. Specifically, VoLTE saves approximately 60.76% of energy before the Service Request and approximately 38.97% of energy after the Service Request. Moreover, VoLTE to VoLTE calls have a 72.6% faster call setup delay than non-VoLTE-based LTE to LTE calls, because of fewer signaling messages required. Furthermore, as compared to non-VoLTE to non-VoLTE calls, VoLTE to non-VoLTE calls offer an 18.6% faster call setup delay. These results showcase the performance advantages of VoLTE and reinforce its potential for offering better services in wireless communication networks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2307.12397v1-abstract-full').style.display = 'none'; document.getElementById('2307.12397v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 July, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted for presentation and Publication on the IEEE 10th International Conference on Electrical Engineering, Computer Science and Informatics (EECSI 2023)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2307.03368">arXiv:2307.03368</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2307.03368">pdf</a>, <a href="https://arxiv.org/format/2307.03368">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Waveform-Domain Adaptive Matched Filtering for Suppressing Interrupted-Sampling Repeater Jamming </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Su%2C+H">Hanning Su</a>, <a href="/search/eess?searchtype=author&amp;query=Bao%2C+Q">Qinglong Bao</a>, <a href="/search/eess?searchtype=author&amp;query=Pan%2C+J">Jiameng Pan</a>, <a href="/search/eess?searchtype=author&amp;query=Guo%2C+F">Fucheng Guo</a>, <a href="/search/eess?searchtype=author&amp;query=Hu%2C+W">Weidong Hu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2307.03368v2-abstract-short" style="display: inline;"> The inadequate adaptability to flexible interference scenarios remains an unresolved challenge in the majority of techniques utilized for mitigating interrupted-sampling repeater jamming (ISRJ). Matched filtering system based methods is desirable to incorporate anti-ISRJ measures based on prior ISRJ modeling, either preceding or succeeding the matched filtering. Due to the partial matching nature&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2307.03368v2-abstract-full').style.display = 'inline'; document.getElementById('2307.03368v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2307.03368v2-abstract-full" style="display: none;"> The inadequate adaptability to flexible interference scenarios remains an unresolved challenge in the majority of techniques utilized for mitigating interrupted-sampling repeater jamming (ISRJ). Matched filtering system based methods is desirable to incorporate anti-ISRJ measures based on prior ISRJ modeling, either preceding or succeeding the matched filtering. Due to the partial matching nature of ISRJ, its characteristics are revealed during the process of matched filtering. Therefore, this paper introduces an extended domain called the waveform domain within the matched filtering process. On this domain, an adaptive matched filtering model, known as the waveform-domain adaptive matched filtering (WD-AMF), is established to tackle the problem of ISRJ suppression without relying on a pre-existing ISRJ model. The output of the WD-AMF encompasses an adaptive filtering term and a compensation term. The adaptive filtering term encompasses the adaptive integration outcomes in the waveform domain, which are determined by an adaptive weighted function. This function, akin to a collection of bandpass filters, decomposes the integrated function into multiple components, some of which contain interference while others do not. The compensation term adheres to an integrated guideline for discerning the presence of signal components or noise within the integrated function. The integration results are then concatenated to reconstruct a compensated matched filter signal output. Simulations are conducted to showcase the exceptional capability of the proposed method in suppressing ISRJ in diverse interference scenarios, even in the absence of a pre-existing ISRJ model. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2307.03368v2-abstract-full').style.display = 'none'; document.getElementById('2307.03368v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 6 July, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2306.17103">arXiv:2306.17103</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2306.17103">pdf</a>, <a href="https://arxiv.org/format/2306.17103">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> LyricWhiz: Robust Multilingual Zero-shot Lyrics Transcription by Whispering to ChatGPT </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Zhuo%2C+L">Le Zhuo</a>, <a href="/search/eess?searchtype=author&amp;query=Yuan%2C+R">Ruibin Yuan</a>, <a href="/search/eess?searchtype=author&amp;query=Pan%2C+J">Jiahao Pan</a>, <a href="/search/eess?searchtype=author&amp;query=Ma%2C+Y">Yinghao Ma</a>, <a href="/search/eess?searchtype=author&amp;query=LI%2C+Y">Yizhi LI</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+G">Ge Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+S">Si Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Dannenberg%2C+R">Roger Dannenberg</a>, <a href="/search/eess?searchtype=author&amp;query=Fu%2C+J">Jie Fu</a>, <a href="/search/eess?searchtype=author&amp;query=Lin%2C+C">Chenghua Lin</a>, <a href="/search/eess?searchtype=author&amp;query=Benetos%2C+E">Emmanouil Benetos</a>, <a href="/search/eess?searchtype=author&amp;query=Xue%2C+W">Wei Xue</a>, <a href="/search/eess?searchtype=author&amp;query=Guo%2C+Y">Yike Guo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2306.17103v4-abstract-short" style="display: inline;"> We introduce LyricWhiz, a robust, multilingual, and zero-shot automatic lyrics transcription method achieving state-of-the-art performance on various lyrics transcription datasets, even in challenging genres such as rock and metal. Our novel, training-free approach utilizes Whisper, a weakly supervised robust speech recognition model, and GPT-4, today&#39;s most performant chat-based large language mo&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.17103v4-abstract-full').style.display = 'inline'; document.getElementById('2306.17103v4-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2306.17103v4-abstract-full" style="display: none;"> We introduce LyricWhiz, a robust, multilingual, and zero-shot automatic lyrics transcription method achieving state-of-the-art performance on various lyrics transcription datasets, even in challenging genres such as rock and metal. Our novel, training-free approach utilizes Whisper, a weakly supervised robust speech recognition model, and GPT-4, today&#39;s most performant chat-based large language model. In the proposed method, Whisper functions as the &#34;ear&#34; by transcribing the audio, while GPT-4 serves as the &#34;brain,&#34; acting as an annotator with a strong performance for contextualized output selection and correction. Our experiments show that LyricWhiz significantly reduces Word Error Rate compared to existing methods in English and can effectively transcribe lyrics across multiple languages. Furthermore, we use LyricWhiz to create the first publicly available, large-scale, multilingual lyrics transcription dataset with a CC-BY-NC-SA copyright license, based on MTG-Jamendo, and offer a human-annotated subset for noise level estimation and evaluation. We anticipate that our proposed method and dataset will advance the development of multilingual lyrics transcription, a challenging and emerging task. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.17103v4-abstract-full').style.display = 'none'; document.getElementById('2306.17103v4-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 29 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">9 pages, 2 figures, 5 tables, accepted by ISMIR 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2303.15065">arXiv:2303.15065</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2303.15065">pdf</a>, <a href="https://arxiv.org/format/2303.15065">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Single-subject Multi-contrast MRI Super-resolution via Implicit Neural Representations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=McGinnis%2C+J">Julian McGinnis</a>, <a href="/search/eess?searchtype=author&amp;query=Shit%2C+S">Suprosanna Shit</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+H+B">Hongwei Bran Li</a>, <a href="/search/eess?searchtype=author&amp;query=Sideri-Lampretsa%2C+V">Vasiliki Sideri-Lampretsa</a>, <a href="/search/eess?searchtype=author&amp;query=Graf%2C+R">Robert Graf</a>, <a href="/search/eess?searchtype=author&amp;query=Dannecker%2C+M">Maik Dannecker</a>, <a href="/search/eess?searchtype=author&amp;query=Pan%2C+J">Jiazhen Pan</a>, <a href="/search/eess?searchtype=author&amp;query=Ans%C3%B3%2C+N+S">Nil Stolt Ans贸</a>, <a href="/search/eess?searchtype=author&amp;query=M%C3%BChlau%2C+M">Mark M眉hlau</a>, <a href="/search/eess?searchtype=author&amp;query=Kirschke%2C+J+S">Jan S. Kirschke</a>, <a href="/search/eess?searchtype=author&amp;query=Rueckert%2C+D">Daniel Rueckert</a>, <a href="/search/eess?searchtype=author&amp;query=Wiestler%2C+B">Benedikt Wiestler</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2303.15065v3-abstract-short" style="display: inline;"> Clinical routine and retrospective cohorts commonly include multi-parametric Magnetic Resonance Imaging; however, they are mostly acquired in different anisotropic 2D views due to signal-to-noise-ratio and scan-time constraints. Thus acquired views suffer from poor out-of-plane resolution and affect downstream volumetric image analysis that typically requires isotropic 3D scans. Combining differen&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.15065v3-abstract-full').style.display = 'inline'; document.getElementById('2303.15065v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2303.15065v3-abstract-full" style="display: none;"> Clinical routine and retrospective cohorts commonly include multi-parametric Magnetic Resonance Imaging; however, they are mostly acquired in different anisotropic 2D views due to signal-to-noise-ratio and scan-time constraints. Thus acquired views suffer from poor out-of-plane resolution and affect downstream volumetric image analysis that typically requires isotropic 3D scans. Combining different views of multi-contrast scans into high-resolution isotropic 3D scans is challenging due to the lack of a large training cohort, which calls for a subject-specific framework. This work proposes a novel solution to this problem leveraging Implicit Neural Representations (INR). Our proposed INR jointly learns two different contrasts of complementary views in a continuous spatial function and benefits from exchanging anatomical information between them. Trained within minutes on a single commodity GPU, our model provides realistic super-resolution across different pairs of contrasts in our experiments with three datasets. Using Mutual Information (MI) as a metric, we find that our model converges to an optimum MI amongst sequences, achieving anatomically faithful reconstruction. Code is available at: https://github.com/jqmcginnis/multi_contrast_inr/ <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.15065v3-abstract-full').style.display = 'none'; document.getElementById('2303.15065v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 January, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 27 March, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2302.02504">arXiv:2302.02504</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2302.02504">pdf</a>, <a href="https://arxiv.org/format/2302.02504">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Motion-compensated MR CINE reconstruction with reconstruction-driven motion estimation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Pan%2C+J">Jiazhen Pan</a>, <a href="/search/eess?searchtype=author&amp;query=Huang%2C+W">Wenqi Huang</a>, <a href="/search/eess?searchtype=author&amp;query=Rueckert%2C+D">Daniel Rueckert</a>, <a href="/search/eess?searchtype=author&amp;query=K%C3%BCstner%2C+T">Thomas K眉stner</a>, <a href="/search/eess?searchtype=author&amp;query=Hammernik%2C+K">Kerstin Hammernik</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2302.02504v2-abstract-short" style="display: inline;"> In cardiac CINE, motion-compensated MR reconstruction (MCMR) is an effective approach to address highly undersampled acquisitions by incorporating motion information between frames. In this work, we propose a novel perspective for addressing the MCMR problem and a more integrated and efficient solution to the MCMR field. Contrary to state-of-the-art (SOTA) MCMR methods which break the original pro&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2302.02504v2-abstract-full').style.display = 'inline'; document.getElementById('2302.02504v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2302.02504v2-abstract-full" style="display: none;"> In cardiac CINE, motion-compensated MR reconstruction (MCMR) is an effective approach to address highly undersampled acquisitions by incorporating motion information between frames. In this work, we propose a novel perspective for addressing the MCMR problem and a more integrated and efficient solution to the MCMR field. Contrary to state-of-the-art (SOTA) MCMR methods which break the original problem into two sub-optimization problems, i.e. motion estimation and reconstruction, we formulate this problem as a single entity with one single optimization. Our approach is unique in that the motion estimation is directly driven by the ultimate goal, reconstruction, but not by the canonical motion-warping loss (similarity measurement between motion-warped images and target images). We align the objectives of motion estimation and reconstruction, eliminating the drawbacks of artifacts-affected motion estimation and therefore error-propagated reconstruction. Further, we can deliver high-quality reconstruction and realistic motion without applying any regularization/smoothness loss terms, circumventing the non-trivial weighting factor tuning. We evaluate our method on two datasets: 1) an in-house acquired 2D CINE dataset for the retrospective study and 2) the public OCMR cardiac dataset for the prospective study. The conducted experiments indicate that the proposed MCMR framework can deliver artifact-free motion estimation and high-quality MR images even for imaging accelerations up to 20x, outperforming SOTA non-MCMR and MCMR methods in both qualitative and quantitative evaluation across all experiments. The code is available at https://github.com/JZPeterPan/MCMR-Recon-Driven-Motion. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2302.02504v2-abstract-full').style.display = 'none'; document.getElementById('2302.02504v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 5 February, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2212.13053">arXiv:2212.13053</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2212.13053">pdf</a>, <a href="https://arxiv.org/ps/2212.13053">ps</a>, <a href="https://arxiv.org/format/2212.13053">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/LRA.2021.3062805">10.1109/LRA.2021.3062805 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Learning-based Predictive Path Following Control for Nonlinear Systems Under Uncertain Disturbances </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Yang%2C+R">Rui Yang</a>, <a href="/search/eess?searchtype=author&amp;query=Zheng%2C+L">Lei Zheng</a>, <a href="/search/eess?searchtype=author&amp;query=Pan%2C+J">Jiesen Pan</a>, <a href="/search/eess?searchtype=author&amp;query=Cheng%2C+H">Hui Cheng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2212.13053v1-abstract-short" style="display: inline;"> Accurate path following is challenging for autonomous robots operating in uncertain environments. Adaptive and predictive control strategies are crucial for a nonlinear robotic system to achieve high-performance path following control. In this paper, we propose a novel learning-based predictive control scheme that couples a high-level model predictive path following controller (MPFC) with a low-le&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2212.13053v1-abstract-full').style.display = 'inline'; document.getElementById('2212.13053v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2212.13053v1-abstract-full" style="display: none;"> Accurate path following is challenging for autonomous robots operating in uncertain environments. Adaptive and predictive control strategies are crucial for a nonlinear robotic system to achieve high-performance path following control. In this paper, we propose a novel learning-based predictive control scheme that couples a high-level model predictive path following controller (MPFC) with a low-level learning-based feedback linearization controller (LB-FBLC) for nonlinear systems under uncertain disturbances. The low-level LB-FBLC utilizes Gaussian Processes to learn the uncertain environmental disturbances online and tracks the reference state accurately with a probabilistic stability guarantee. Meanwhile, the high-level MPFC exploits the linearized system model augmented with a virtual linear path dynamics model to optimize the evolution of path reference targets, and provides the reference states and controls for the low-level LB-FBLC. Simulation results illustrate the effectiveness of the proposed control strategy on a quadrotor path following task under unknown wind disturbances. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2212.13053v1-abstract-full').style.display = 'none'; document.getElementById('2212.13053v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 December, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages, 7 figures, accepted for publication in IEEE Robotics and Automation Letters ( Volume: 6, Issue: 2, April 2021)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2212.08479">arXiv:2212.08479</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2212.08479">pdf</a>, <a href="https://arxiv.org/format/2212.08479">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Neural Implicit k-Space for Binning-free Non-Cartesian Cardiac MR Imaging </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Huang%2C+W">Wenqi Huang</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+H">Hongwei Li</a>, <a href="/search/eess?searchtype=author&amp;query=Pan%2C+J">Jiazhen Pan</a>, <a href="/search/eess?searchtype=author&amp;query=Cruz%2C+G">Gastao Cruz</a>, <a href="/search/eess?searchtype=author&amp;query=Rueckert%2C+D">Daniel Rueckert</a>, <a href="/search/eess?searchtype=author&amp;query=Hammernik%2C+K">Kerstin Hammernik</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2212.08479v5-abstract-short" style="display: inline;"> In this work, we propose a novel image reconstruction framework that directly learns a neural implicit representation in k-space for ECG-triggered non-Cartesian Cardiac Magnetic Resonance Imaging (CMR). While existing methods bin acquired data from neighboring time points to reconstruct one phase of the cardiac motion, our framework allows for a continuous, binning-free, and subject-specific k-spa&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2212.08479v5-abstract-full').style.display = 'inline'; document.getElementById('2212.08479v5-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2212.08479v5-abstract-full" style="display: none;"> In this work, we propose a novel image reconstruction framework that directly learns a neural implicit representation in k-space for ECG-triggered non-Cartesian Cardiac Magnetic Resonance Imaging (CMR). While existing methods bin acquired data from neighboring time points to reconstruct one phase of the cardiac motion, our framework allows for a continuous, binning-free, and subject-specific k-space representation.We assign a unique coordinate that consists of time, coil index, and frequency domain location to each sampled k-space point. We then learn the subject-specific mapping from these unique coordinates to k-space intensities using a multi-layer perceptron with frequency domain regularization. During inference, we obtain a complete k-space for Cartesian coordinates and an arbitrary temporal resolution. A simple inverse Fourier transform recovers the image, eliminating the need for density compensation and costly non-uniform Fourier transforms for non-Cartesian data. This novel imaging framework was tested on 42 radially sampled datasets from 6 subjects. The proposed method outperforms other techniques qualitatively and quantitatively using data from four and one heartbeat(s) and 30 cardiac phases. Our results for one heartbeat reconstruction of 50 cardiac phases show improved artifact removal and spatio-temporal resolution, leveraging the potential for real-time CMR. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2212.08479v5-abstract-full').style.display = 'none'; document.getElementById('2212.08479v5-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 16 December, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2022. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2212.05805">arXiv:2212.05805</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2212.05805">pdf</a>, <a href="https://arxiv.org/format/2212.05805">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Direct Speech-to-speech Translation without Textual Annotation using Bottleneck Features </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+J">Junhui Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Pan%2C+J">Junjie Pan</a>, <a href="/search/eess?searchtype=author&amp;query=Yin%2C+X">Xiang Yin</a>, <a href="/search/eess?searchtype=author&amp;query=Ma%2C+Z">Zejun Ma</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2212.05805v1-abstract-short" style="display: inline;"> Speech-to-speech translation directly translates a speech utterance to another between different languages, and has great potential in tasks such as simultaneous interpretation. State-of-art models usually contains an auxiliary module for phoneme sequences prediction, and this requires textual annotation of the training dataset. We propose a direct speech-to-speech translation model which can be t&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2212.05805v1-abstract-full').style.display = 'inline'; document.getElementById('2212.05805v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2212.05805v1-abstract-full" style="display: none;"> Speech-to-speech translation directly translates a speech utterance to another between different languages, and has great potential in tasks such as simultaneous interpretation. State-of-art models usually contains an auxiliary module for phoneme sequences prediction, and this requires textual annotation of the training dataset. We propose a direct speech-to-speech translation model which can be trained without any textual annotation or content information. Instead of introducing an auxiliary phoneme prediction task in the model, we propose to use bottleneck features as intermediate training objectives for our model to ensure the translation performance of the system. Experiments on Mandarin-Cantonese speech translation demonstrate the feasibility of the proposed approach and the performance can match a cascaded system with respect of translation and synthesis qualities. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2212.05805v1-abstract-full').style.display = 'none'; document.getElementById('2212.05805v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 December, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">4 pages, 3 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2212.03482">arXiv:2212.03482</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2212.03482">pdf</a>, <a href="https://arxiv.org/format/2212.03482">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Improved Speech Pre-Training with Supervision-Enhanced Acoustic Unit </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Li%2C+P">Pengcheng Li</a>, <a href="/search/eess?searchtype=author&amp;query=Wan%2C+G">Genshun Wan</a>, <a href="/search/eess?searchtype=author&amp;query=Ding%2C+F">Fenglin Ding</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+H">Hang Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Gao%2C+J">Jianqing Gao</a>, <a href="/search/eess?searchtype=author&amp;query=Pan%2C+J">Jia Pan</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+C">Cong Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2212.03482v1-abstract-short" style="display: inline;"> Speech pre-training has shown great success in learning useful and general latent representations from large-scale unlabeled data. Based on a well-designed self-supervised learning pattern, pre-trained models can be used to serve lots of downstream speech tasks such as automatic speech recognition. In order to take full advantage of the labed data in low resource task, we present an improved pre-t&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2212.03482v1-abstract-full').style.display = 'inline'; document.getElementById('2212.03482v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2212.03482v1-abstract-full" style="display: none;"> Speech pre-training has shown great success in learning useful and general latent representations from large-scale unlabeled data. Based on a well-designed self-supervised learning pattern, pre-trained models can be used to serve lots of downstream speech tasks such as automatic speech recognition. In order to take full advantage of the labed data in low resource task, we present an improved pre-training method by introducing a supervision-enhanced acoustic unit (SEAU) pattern to intensify the expression of comtext information and ruduce the training cost. Encoder representations extracted from the SEAU pattern are used to generate more representative target units for HuBERT pre-training process. The proposed method, named SeHuBERT, achieves a relative word error rate reductions of 10.5% and 4.9% comared with the standard HuBERT on Turkmen speech recognition task with 500 hours and 100 hours fine-tuning data respectively. Extended to more languages and more data, SeHuBERT can aslo achieve a relative word error rate reductions of approximately 10% at half of the training cost compared with HuBERT. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2212.03482v1-abstract-full').style.display = 'none'; document.getElementById('2212.03482v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 December, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Submitted to ICASSP 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2212.03480">arXiv:2212.03480</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2212.03480">pdf</a>, <a href="https://arxiv.org/format/2212.03480">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Progressive Multi-Scale Self-Supervised Learning for Speech Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Wan%2C+G">Genshun Wan</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+T">Tan Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+H">Hang Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Pan%2C+J">Jia Pan</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+C">Cong Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Ye%2C+Z">Zhongfu Ye</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2212.03480v1-abstract-short" style="display: inline;"> Self-supervised learning (SSL) models have achieved considerable improvements in automatic speech recognition (ASR). In addition, ASR performance could be further improved if the model is dedicated to audio content information learning theoretically. To this end, we propose a progressive multi-scale self-supervised learning (PMS-SSL) method, which uses fine-grained target sets to compute SSL loss&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2212.03480v1-abstract-full').style.display = 'inline'; document.getElementById('2212.03480v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2212.03480v1-abstract-full" style="display: none;"> Self-supervised learning (SSL) models have achieved considerable improvements in automatic speech recognition (ASR). In addition, ASR performance could be further improved if the model is dedicated to audio content information learning theoretically. To this end, we propose a progressive multi-scale self-supervised learning (PMS-SSL) method, which uses fine-grained target sets to compute SSL loss at top layer while uses coarse-grained target sets at intermediate layers. Furthermore, PMS-SSL introduces multi-scale structure into multi-head self-attention for better speech representation, which restricts the attention area into a large scope at higher layers while restricts the attention area into a small scope at lower layers. Experiments on Librispeech dataset indicate the effectiveness of our proposed method. Compared with HuBERT, PMS-SSL achieves 13.7% / 12.7% relative WER reduction on test other evaluation subsets respectively when fine-tuned on 10hours / 100hours subsets. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2212.03480v1-abstract-full').style.display = 'none'; document.getElementById('2212.03480v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 December, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Submitted to ICASSP 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2212.03476">arXiv:2212.03476</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2212.03476">pdf</a>, <a href="https://arxiv.org/ps/2212.03476">ps</a>, <a href="https://arxiv.org/format/2212.03476">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Improved Self-Supervised Multilingual Speech Representation Learning Combined with Auxiliary Language Information </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Ding%2C+F">Fenglin Ding</a>, <a href="/search/eess?searchtype=author&amp;query=Wan%2C+G">Genshun Wan</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+P">Pengcheng Li</a>, <a href="/search/eess?searchtype=author&amp;query=Pan%2C+J">Jia Pan</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+C">Cong Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2212.03476v1-abstract-short" style="display: inline;"> Multilingual end-to-end models have shown great improvement over monolingual systems. With the development of pre-training methods on speech, self-supervised multilingual speech representation learning like XLSR has shown success in improving the performance of multilingual automatic speech recognition (ASR). However, similar to the supervised learning, multilingual pre-training may also suffer fr&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2212.03476v1-abstract-full').style.display = 'inline'; document.getElementById('2212.03476v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2212.03476v1-abstract-full" style="display: none;"> Multilingual end-to-end models have shown great improvement over monolingual systems. With the development of pre-training methods on speech, self-supervised multilingual speech representation learning like XLSR has shown success in improving the performance of multilingual automatic speech recognition (ASR). However, similar to the supervised learning, multilingual pre-training may also suffer from language interference and further affect the application of multilingual system. In this paper, we introduce several techniques for improving self-supervised multilingual pre-training by leveraging auxiliary language information, including the language adversarial training, language embedding and language adaptive training during the pre-training stage. We conduct experiments on a multilingual ASR task consisting of 16 languages. Our experimental results demonstrate 14.3% relative gain over the standard XLSR model, and 19.8% relative gain over the no pre-training multilingual model. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2212.03476v1-abstract-full').style.display = 'none'; document.getElementById('2212.03476v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 December, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Subimitted to ICASSP 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2212.02782">arXiv:2212.02782</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2212.02782">pdf</a>, <a href="https://arxiv.org/format/2212.02782">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Self-Supervised Audio-Visual Speech Representations Learning By Multimodal Self-Distillation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+J">Jing-Xuan Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Wan%2C+G">Genshun Wan</a>, <a href="/search/eess?searchtype=author&amp;query=Ling%2C+Z">Zhen-Hua Ling</a>, <a href="/search/eess?searchtype=author&amp;query=Pan%2C+J">Jia Pan</a>, <a href="/search/eess?searchtype=author&amp;query=Gao%2C+J">Jianqing Gao</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+C">Cong Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2212.02782v1-abstract-short" style="display: inline;"> In this work, we present a novel method, named AV2vec, for learning audio-visual speech representations by multimodal self-distillation. AV2vec has a student and a teacher module, in which the student performs a masked latent feature regression task using the multimodal target features generated online by the teacher. The parameters of the teacher model are a momentum update of the student. Since&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2212.02782v1-abstract-full').style.display = 'inline'; document.getElementById('2212.02782v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2212.02782v1-abstract-full" style="display: none;"> In this work, we present a novel method, named AV2vec, for learning audio-visual speech representations by multimodal self-distillation. AV2vec has a student and a teacher module, in which the student performs a masked latent feature regression task using the multimodal target features generated online by the teacher. The parameters of the teacher model are a momentum update of the student. Since our target features are generated online, AV2vec needs no iteration step like AV-HuBERT and the total training time cost is reduced to less than one-fifth. We further propose AV2vec-MLM in this study, which augments AV2vec with a masked language model (MLM)-style loss using multitask learning. Our experimental results show that AV2vec achieved comparable performance to the AV-HuBERT baseline. When combined with an MLM-style loss, AV2vec-MLM outperformed baselines and achieved the best performance on the downstream tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2212.02782v1-abstract-full').style.display = 'none'; document.getElementById('2212.02782v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 December, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">submitted to ICASSP 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2211.05910">arXiv:2211.05910</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2211.05910">pdf</a>, <a href="https://arxiv.org/format/2211.05910">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Efficient and Accurate Quantized Image Super-Resolution on Mobile NPUs, Mobile AI &amp; AIM 2022 challenge: Report </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Ignatov%2C+A">Andrey Ignatov</a>, <a href="/search/eess?searchtype=author&amp;query=Timofte%2C+R">Radu Timofte</a>, <a href="/search/eess?searchtype=author&amp;query=Denna%2C+M">Maurizio Denna</a>, <a href="/search/eess?searchtype=author&amp;query=Younes%2C+A">Abdel Younes</a>, <a href="/search/eess?searchtype=author&amp;query=Gankhuyag%2C+G">Ganzorig Gankhuyag</a>, <a href="/search/eess?searchtype=author&amp;query=Huh%2C+J">Jingang Huh</a>, <a href="/search/eess?searchtype=author&amp;query=Kim%2C+M+K">Myeong Kyun Kim</a>, <a href="/search/eess?searchtype=author&amp;query=Yoon%2C+K">Kihwan Yoon</a>, <a href="/search/eess?searchtype=author&amp;query=Moon%2C+H">Hyeon-Cheol Moon</a>, <a href="/search/eess?searchtype=author&amp;query=Lee%2C+S">Seungho Lee</a>, <a href="/search/eess?searchtype=author&amp;query=Choe%2C+Y">Yoonsik Choe</a>, <a href="/search/eess?searchtype=author&amp;query=Jeong%2C+J">Jinwoo Jeong</a>, <a href="/search/eess?searchtype=author&amp;query=Kim%2C+S">Sungjei Kim</a>, <a href="/search/eess?searchtype=author&amp;query=Smyl%2C+M">Maciej Smyl</a>, <a href="/search/eess?searchtype=author&amp;query=Latkowski%2C+T">Tomasz Latkowski</a>, <a href="/search/eess?searchtype=author&amp;query=Kubik%2C+P">Pawel Kubik</a>, <a href="/search/eess?searchtype=author&amp;query=Sokolski%2C+M">Michal Sokolski</a>, <a href="/search/eess?searchtype=author&amp;query=Ma%2C+Y">Yujie Ma</a>, <a href="/search/eess?searchtype=author&amp;query=Chao%2C+J">Jiahao Chao</a>, <a href="/search/eess?searchtype=author&amp;query=Zhou%2C+Z">Zhou Zhou</a>, <a href="/search/eess?searchtype=author&amp;query=Gao%2C+H">Hongfan Gao</a>, <a href="/search/eess?searchtype=author&amp;query=Yang%2C+Z">Zhengfeng Yang</a>, <a href="/search/eess?searchtype=author&amp;query=Zeng%2C+Z">Zhenbing Zeng</a>, <a href="/search/eess?searchtype=author&amp;query=Zhuge%2C+Z">Zhengyang Zhuge</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+C">Chenghua Li</a> , et al. (71 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2211.05910v1-abstract-short" style="display: inline;"> Image super-resolution is a common task on mobile and IoT devices, where one often needs to upscale and enhance low-resolution images and video frames. While numerous solutions have been proposed for this problem in the past, they are usually not compatible with low-power mobile NPUs having many computational and memory constraints. In this Mobile AI challenge, we address this problem and propose&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.05910v1-abstract-full').style.display = 'inline'; document.getElementById('2211.05910v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2211.05910v1-abstract-full" style="display: none;"> Image super-resolution is a common task on mobile and IoT devices, where one often needs to upscale and enhance low-resolution images and video frames. While numerous solutions have been proposed for this problem in the past, they are usually not compatible with low-power mobile NPUs having many computational and memory constraints. In this Mobile AI challenge, we address this problem and propose the participants to design an efficient quantized image super-resolution solution that can demonstrate a real-time performance on mobile NPUs. The participants were provided with the DIV2K dataset and trained INT8 models to do a high-quality 3X image upscaling. The runtime of all models was evaluated on the Synaptics VS680 Smart Home board with a dedicated edge NPU capable of accelerating quantized neural networks. All proposed solutions are fully compatible with the above NPU, demonstrating an up to 60 FPS rate when reconstructing Full HD resolution images. A detailed description of all models developed in the challenge is provided in this paper. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.05910v1-abstract-full').style.display = 'none'; document.getElementById('2211.05910v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 November, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">arXiv admin note: text overlap with arXiv:2105.07825, arXiv:2105.08826, arXiv:2211.04470, arXiv:2211.03885, arXiv:2211.05256</span> </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Pan%2C+J&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Pan%2C+J&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Pan%2C+J&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Pan%2C+J&amp;start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> </ul> </nav> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>

Pages: 1 2 3 4 5 6 7 8 9 10