
Search | arXiv e-print repository

<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1&ndash;50 of 74 results for author: <span class="mathjax">Guo, P</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> <div class="content"> <form method="GET" action="/search/eess" aria-role="search"> Searching in archive <strong>eess</strong>. <a href="/search/?searchtype=author&amp;query=Guo%2C+P">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Guo, P"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Guo%2C+P&amp;terms-0-field=author&amp;size=50&amp;order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Guo, P"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Guo%2C+P&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Guo%2C+P&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Guo%2C+P&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.05845">arXiv:2502.05845</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.05845">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Exploiting the Hidden Capacity of MMC Through Accurate Quantification of Modulation Indices </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Sun%2C+Q">Qianhao Sun</a>, <a href="/search/eess?searchtype=author&amp;query=Meng%2C+J">Jingwei Meng</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+R">Ruofan Li</a>, <a href="/search/eess?searchtype=author&amp;query=Xia%2C+M">Mingchao Xia</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+Q">Qifang Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Zhou%2C+J">Jiejie Zhou</a>, <a href="/search/eess?searchtype=author&amp;query=Fan%2C+M">Meiqi Fan</a>, <a href="/search/eess?searchtype=author&amp;query=Guo%2C+P">Peiqian Guo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.05845v1-abstract-short" style="display: inline;"> The modular multilevel converter (MMC) has become increasingly important in voltage-source converter-based high-voltage direct current (VSC-HVDC) systems. Direct and indirect modulation are widely used as mainstream modulation techniques in MMCs. 
However, due to the challenge of quantitatively evaluating the operation of different modulation schemes, the academic and industrial communities still h&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05845v1-abstract-full').style.display = 'inline'; document.getElementById('2502.05845v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.05845v1-abstract-full" style="display: none;"> The modular multilevel converter (MMC) has become increasingly important in voltage-source converter-based high-voltage direct current (VSC-HVDC) systems. Direct and indirect modulation are widely used as mainstream modulation techniques in MMCs. However, due to the challenge of quantitatively evaluating the operation of different modulation schemes, the academic and industrial communities still hold differing opinions on their performance. To address this controversy, this paper employs the state-of-the-art computational methods and quantitative metrics to compare the performance among different modulation schemes. The findings indicate that direct modulation offers superior modulation potential for MMCs, highlighting its higher ac voltage output capability and broader linear PQ operation region. Conversely, indirect modulation is disadvantaged in linear modulation, which indicates inferior output voltage capability. Furthermore, this paper delves into the conditions whereby direct and indirect modulation techniques become equivalent in steady-state. The study findings suggest that the modulation capability of direct modulation is the same as that of indirect modulation in steady-state when additional controls, including closed-loop capacitor voltage control and circulating current suppression control (CCSC), are simultaneously active. Simulation and experiments verify the correctness and validity. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05845v1-abstract-full').style.display = 'none'; document.getElementById('2502.05845v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. 
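As background for the comparison above: in the textbook formulation, direct modulation computes the arm insertion indices from the nominal dc-link voltage, while indirect (compensated) modulation normalizes the same arm-voltage references by the measured sum of each arm's capacitor voltages. The sketch below illustrates only that standard distinction with NumPy; it is not the paper's quantification method, and all signal names and values are illustrative.

```python
import numpy as np

# Minimal sketch of the textbook distinction between direct and indirect
# (compensated) MMC modulation. Per-phase, normalized quantities only;
# this is NOT the paper's quantification method.

f = 50.0                      # fundamental frequency [Hz]
V_dc = 1.0                    # nominal pole-to-pole dc voltage (normalized)
m = 0.9                       # ac modulation index
t = np.linspace(0, 1 / f, 1000)

v_s_ref = 0.5 * m * V_dc * np.cos(2 * np.pi * f * t)   # desired ac-side voltage
v_cu_ref = V_dc / 2 - v_s_ref                           # upper-arm inserted-voltage reference
v_cl_ref = V_dc / 2 + v_s_ref                           # lower-arm inserted-voltage reference

# Direct modulation: insertion indices use the *nominal* dc voltage.
n_u_direct = v_cu_ref / V_dc
n_l_direct = v_cl_ref / V_dc

# Indirect (compensated) modulation: insertion indices are normalized by the
# *measured* sum capacitor voltage of each arm, here emulated with a ripple.
ripple = 0.05 * V_dc * np.sin(2 * np.pi * f * t + 0.3)
v_sigma_u = V_dc + ripple          # measured sum capacitor voltage, upper arm
v_sigma_l = V_dc - ripple          # measured sum capacitor voltage, lower arm
n_u_indirect = v_cu_ref / v_sigma_u
n_l_indirect = v_cl_ref / v_sigma_l

print("max |n_u| direct  :", np.max(np.abs(n_u_direct)))
print("max |n_u| indirect:", np.max(np.abs(n_u_indirect)))
```
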
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.13306">arXiv:2501.13306</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.13306">pdf</a>, <a href="https://arxiv.org/format/2501.13306">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> OSUM: Advancing Open Speech Understanding Models with Limited Resources in Academia </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Geng%2C+X">Xuelong Geng</a>, <a href="/search/eess?searchtype=author&amp;query=Wei%2C+K">Kun Wei</a>, <a href="/search/eess?searchtype=author&amp;query=Shao%2C+Q">Qijie Shao</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+S">Shuiyun Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Lin%2C+Z">Zhennan Lin</a>, <a href="/search/eess?searchtype=author&amp;query=Zhao%2C+Z">Zhixian Zhao</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+G">Guojian Li</a>, <a href="/search/eess?searchtype=author&amp;query=Tian%2C+W">Wenjie Tian</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+P">Peikun Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+Y">Yangze Li</a>, <a href="/search/eess?searchtype=author&amp;query=Guo%2C+P">Pengcheng Guo</a>, <a href="/search/eess?searchtype=author&amp;query=Shao%2C+M">Mingchen Shao</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+S">Shuiyuan Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Cao%2C+Y">Yuang Cao</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+C">Chengyou Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Xu%2C+T">Tianyi Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Dai%2C+Y">Yuhang Dai</a>, <a href="/search/eess?searchtype=author&amp;query=Zhu%2C+X">Xinfa Zhu</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+Y">Yue Li</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+L">Li Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Xie%2C+L">Lei Xie</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.13306v2-abstract-short" style="display: inline;"> Large Language Models (LLMs) have made significant progress in various downstream tasks, inspiring the development of Speech Understanding Language Models (SULMs) to enable comprehensive speech-based interactions. However, most advanced SULMs are developed by the industry, leveraging large-scale datasets and computational resources that are not readily available to the academic community. 
Moreover&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.13306v2-abstract-full').style.display = 'inline'; document.getElementById('2501.13306v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.13306v2-abstract-full" style="display: none;"> Large Language Models (LLMs) have made significant progress in various downstream tasks, inspiring the development of Speech Understanding Language Models (SULMs) to enable comprehensive speech-based interactions. However, most advanced SULMs are developed by the industry, leveraging large-scale datasets and computational resources that are not readily available to the academic community. Moreover, the lack of transparency in training details creates additional barriers to further innovation. In this study, we present OSUM, an Open Speech Understanding Model designed to explore the potential of training SLUMs under constrained academic resources. The OSUM model combines a Whisper encoder with a Qwen2 LLM and supports a wide range of speech tasks, including speech recognition (ASR), speech recognition with timestamps (SRWT), vocal event detection (VED), speech emotion recognition (SER), speaking style recognition (SSR), speaker gender classification (SGC), speaker age prediction (SAP), and speech-to-text chat (STTC). By employing an ASR+X training strategy, OSUM achieves efficient and stable multi-task training by simultaneously optimizing ASR alongside target tasks. Beyond delivering strong performance, OSUM emphasizes transparency by providing openly available data preparation and training methodologies, offering valuable insights and practical guidance for the academic community. By doing so, we aim to accelerate research and innovation in advanced SULM technologies. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.13306v2-abstract-full').style.display = 'none'; document.getElementById('2501.13306v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 22 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">OSUM Technical Report v2. 
The experimental results reported herein differ from those in v1 because of adding new data and training in more steps</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.05127">arXiv:2501.05127</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.05127">pdf</a>, <a href="https://arxiv.org/format/2501.05127">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> DiffAttack: Diffusion-based Timbre-reserved Adversarial Attack in Speaker Identification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Wang%2C+Q">Qing Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Yao%2C+J">Jixun Yao</a>, <a href="/search/eess?searchtype=author&amp;query=Sun%2C+Z">Zhaokai Sun</a>, <a href="/search/eess?searchtype=author&amp;query=Guo%2C+P">Pengcheng Guo</a>, <a href="/search/eess?searchtype=author&amp;query=Xie%2C+L">Lei Xie</a>, <a href="/search/eess?searchtype=author&amp;query=Hansen%2C+J+H+L">John H. L. Hansen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.05127v1-abstract-short" style="display: inline;"> Being a form of biometric identification, the security of the speaker identification (SID) system is of utmost importance. To better understand the robustness of SID systems, we aim to perform more realistic attacks in SID, which are challenging for both humans and machines to detect. In this study, we propose DiffAttack, a novel timbre-reserved adversarial attack approach that exploits the capabi&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.05127v1-abstract-full').style.display = 'inline'; document.getElementById('2501.05127v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.05127v1-abstract-full" style="display: none;"> Being a form of biometric identification, the security of the speaker identification (SID) system is of utmost importance. To better understand the robustness of SID systems, we aim to perform more realistic attacks in SID, which are challenging for both humans and machines to detect. In this study, we propose DiffAttack, a novel timbre-reserved adversarial attack approach that exploits the capability of a diffusion-based voice conversion (DiffVC) model to generate adversarial fake audio with distinct target speaker attribution. By introducing adversarial constraints into the generative process of the diffusion-based voice conversion model, we craft fake samples that effectively mislead target models while preserving speaker-wise characteristics. Specifically, inspired by the use of randomly sampled Gaussian noise in conventional adversarial attacks and diffusion processes, we incorporate adversarial constraints into the reverse diffusion process. These constraints subtly guide the reverse diffusion process toward aligning with the target speaker distribution. 
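The "ASR+X" strategy is described only at a high level above. One plausible reading, sketched below under stated assumptions, is that each training target concatenates the transcript with a task tag and the task-specific label, so a single token-level loss supervises ASR and the target task at once. The toy tokenizer, tags, and tensor shapes here are hypothetical illustrations, not the OSUM recipe.

```python
import torch
import torch.nn.functional as F

# Hypothetical "ASR+X" target: transcript tokens, then a task tag, then the
# task-specific label, all supervised by one cross-entropy loss.
def build_asr_x_target(transcript: str, task_tag: str, task_label: str) -> str:
    # e.g. "i am so happy today <SER> happy"
    return f"{transcript} {task_tag} {task_label}"

# Toy whitespace "tokenizer" just to keep the example self-contained.
def encode(text: str, vocab: dict) -> torch.Tensor:
    ids = [vocab.setdefault(tok, len(vocab)) for tok in text.split()]
    return torch.tensor(ids)

vocab: dict = {}
target = build_asr_x_target("i am so happy today", "<SER>", "happy")
target_ids = encode(target, vocab)                 # (T,)

# Stand-in for decoder logits from a speech-conditioned LLM: (T, vocab_size).
logits = torch.randn(len(target_ids), len(vocab), requires_grad=True)

# Both the ASR span and the task-label span contribute to the same loss,
# which is the sense in which ASR is optimized "alongside" the target task.
loss = F.cross_entropy(logits, target_ids)
loss.backward()
print(target, "| loss:", float(loss))
```
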
3. arXiv:2501.05127 [pdf, other]
   Categories: cs.SD (Sound), eess.AS (Audio and Speech Processing)
   DiffAttack: Diffusion-based Timbre-reserved Adversarial Attack in Speaker Identification
   Authors: Qing Wang, Jixun Yao, Zhaokai Sun, Pengcheng Guo, Lei Xie, John H. L. Hansen
   Abstract: Being a form of biometric identification, the security of the speaker identification (SID) system is of utmost importance. To better understand the robustness of SID systems, we aim to perform more realistic attacks in SID, which are challenging for both humans and machines to detect. In this study, we propose DiffAttack, a novel timbre-reserved adversarial attack approach that exploits the capability of a diffusion-based voice conversion (DiffVC) model to generate adversarial fake audio with distinct target speaker attribution. By introducing adversarial constraints into the generative process of the diffusion-based voice conversion model, we craft fake samples that effectively mislead target models while preserving speaker-wise characteristics. Specifically, inspired by the use of randomly sampled Gaussian noise in conventional adversarial attacks and diffusion processes, we incorporate adversarial constraints into the reverse diffusion process. These constraints subtly guide the reverse diffusion process toward aligning with the target speaker distribution. Our experiments on the LibriTTS dataset indicate that DiffAttack significantly improves the attack success rate compared to vanilla DiffVC and other methods. Moreover, objective and subjective evaluations demonstrate that introducing adversarial constraints does not compromise the speech quality generated by the DiffVC model.
   Submitted 9 January, 2025; originally announced January 2025.
   Comments: 5 pages, 4 figures, accepted by ICASSP 2025

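The abstract above only states that adversarial constraints are folded into the reverse diffusion process. A generic way to realize that idea, shown below as a hedged sketch, is gradient guidance: after each denoising step, nudge the intermediate sample along the gradient of a speaker-identification loss toward the target speaker. The toy denoiser, SID classifier, and step size are stand-ins, not the DiffAttack formulation.

```python
import torch
import torch.nn as nn

torch.manual_seed(0)

class ToyDenoiser(nn.Module):
    """Stand-in for one reverse step of a diffusion-based VC decoder."""
    def __init__(self, dim: int):
        super().__init__()
        self.net = nn.Linear(dim, dim)
    def forward(self, x_t: torch.Tensor, t: int) -> torch.Tensor:
        return x_t - 0.05 * self.net(x_t)           # toy mean update

class ToySID(nn.Module):
    """Stand-in speaker-identification model producing speaker logits."""
    def __init__(self, dim: int, n_speakers: int):
        super().__init__()
        self.proj = nn.Linear(dim, n_speakers)
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.proj(x)

dim, n_speakers, target_spk = 16, 8, 3
denoiser, sid = ToyDenoiser(dim), ToySID(dim, n_speakers)
guidance_scale = 0.5                                # assumed hyper-parameter
x_t = torch.randn(1, dim)                           # "noisy" latent / feature

for t in reversed(range(10)):                       # reverse diffusion loop
    x_t = denoiser(x_t, t)
    # Adversarial constraint: push the sample toward the target speaker class.
    x_t = x_t.detach().requires_grad_(True)
    loss = nn.functional.cross_entropy(sid(x_t), torch.tensor([target_spk]))
    loss.backward()
    with torch.no_grad():
        x_t = x_t - guidance_scale * x_t.grad       # gradient-guided nudge

print("final SID prediction:", int(sid(x_t).argmax(dim=-1)))
```
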
4. arXiv:2412.18589 [pdf, other]
   Categories: eess.IV (Image and Video Processing), cs.CV (Computer Vision and Pattern Recognition)
   Text-Driven Tumor Synthesis
   Authors: Xinran Li, Yi Shuai, Chen Liu, Qi Chen, Qilong Wu, Pengfei Guo, Dong Yang, Can Zhao, Pedro R. A. S. Bassi, Daguang Xu, Kang Wang, Yang Yang, Alan Yuille, Zongwei Zhou
   Abstract: Tumor synthesis can generate examples that AI often misses or over-detects, improving AI performance by training on these challenging cases. However, existing synthesis methods, which are typically unconditional -- generating images from random variables -- or conditioned only by tumor shapes, lack controllability over specific tumor characteristics such as texture, heterogeneity, boundaries, and pathology type. As a result, the generated tumors may be overly similar to or duplicates of existing training data, failing to effectively address AI's weaknesses. We propose a new text-driven tumor synthesis approach, termed TextoMorph, that provides textual control over tumor characteristics. This is particularly beneficial for examples that confuse the AI the most, such as early tumor detection (increasing Sensitivity by +8.5%), tumor segmentation for precise radiotherapy (increasing DSC by +6.3%), and classification between benign and malignant tumors (improving Sensitivity by +8.2%). By incorporating text mined from radiology reports into the synthesis process, we increase the variability and controllability of the synthetic tumors to target AI's failure cases more precisely. Moreover, TextoMorph uses contrastive learning across different texts and CT scans, significantly reducing dependence on scarce image-report pairs (only 141 pairs used in this study) by leveraging a large corpus of 34,035 radiology reports. Finally, we have developed rigorous tests to evaluate synthetic tumors, including a Text-Driven Visual Turing Test and Radiomics Pattern Analysis, showing that our synthetic tumors are realistic and diverse in texture, heterogeneity, boundaries, and pathology.
   Submitted 24 December, 2024; originally announced December 2024.

5. arXiv:2412.05589 [pdf, other]
   Categories: eess.AS (Audio and Speech Processing), cs.SD (Sound)
   SQ-Whisper: Speaker-Querying based Whisper Model for Target-Speaker ASR
   Authors: Pengcheng Guo, Xuankai Chang, Hang Lv, Shinji Watanabe, Lei Xie
   Abstract: Benefiting from massive and diverse data sources, speech foundation models exhibit strong generalization and knowledge transfer capabilities to a wide range of downstream tasks. However, a limitation arises from their exclusive handling of single-speaker speech input, making them ineffective in recognizing multi-speaker overlapped speech, a common occurrence in real-world scenarios. In this study, we delve into the adaptation of speech foundation models to eliminate interfering speakers from overlapping speech and perform target-speaker automatic speech recognition (TS-ASR). Initially, we utilize the Whisper model as the foundation for adaptation and conduct a thorough comparison of its integration with existing target-speaker adaptation techniques. We then propose an innovative model termed Speaker-Querying Whisper (SQ-Whisper), which employs a set number of trainable queries to capture speaker prompts from overlapping speech based on target-speaker enrollment. These prompts serve to steer the model in extracting speaker-specific features and accurately recognizing target-speaker transcriptions. Experimental results demonstrate that our approach effectively adapts the pre-trained speech foundation model to TS-ASR. Compared with the robust TS-HuBERT model, the proposed SQ-Whisper significantly improves performance, yielding up to 15% and 10% relative reductions in word error rates (WERs) on the Libri2Mix and WSJ0-2Mix datasets, respectively. With data augmentation, we establish new state-of-the-art WERs of 14.6% on the Libri2Mix Test set and 4.4% on the WSJ0-2Mix Test set. Furthermore, we evaluate our model on the real-world AMI meeting dataset, which shows consistent improvement over other adaptation methods.
   Submitted 7 December, 2024; originally announced December 2024.
   Comments: Accepted by IEEE/ACM TASLP

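To make the "trainable queries" idea concrete: a fixed number of learnable query vectors can cross-attend over features of the target-speaker enrollment utterance, and the resulting speaker prompts are then fed to the recognizer together with the mixture encoding. The module below is a minimal PyTorch sketch under those assumptions; the layer sizes, number of queries, and the way prompts are injected are illustrative guesses, not the published SQ-Whisper design.

```python
import torch
import torch.nn as nn

class SpeakerQueryModule(nn.Module):
    """Learnable queries that distill speaker prompts from enrollment features."""
    def __init__(self, d_model: int = 256, n_queries: int = 4, n_heads: int = 4):
        super().__init__()
        self.queries = nn.Parameter(torch.randn(n_queries, d_model) * 0.02)
        self.cross_attn = nn.MultiheadAttention(d_model, n_heads, batch_first=True)

    def forward(self, enroll_feats: torch.Tensor) -> torch.Tensor:
        # enroll_feats: (batch, T_enroll, d_model) from the speech encoder.
        batch = enroll_feats.size(0)
        q = self.queries.unsqueeze(0).expand(batch, -1, -1)
        prompts, _ = self.cross_attn(q, enroll_feats, enroll_feats)
        return prompts                              # (batch, n_queries, d_model)

# Illustrative usage: prepend the speaker prompts to the mixture encoding so the
# decoder is steered toward the enrolled target speaker.
sq = SpeakerQueryModule()
enroll = torch.randn(2, 120, 256)                   # enrollment utterance features
mixture = torch.randn(2, 600, 256)                  # overlapped-speech features
prompts = sq(enroll)
conditioned = torch.cat([prompts, mixture], dim=1)  # (2, 4 + 600, 256)
print(conditioned.shape)
```
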
6. arXiv:2409.19660 [pdf, other]
   Categories: cs.CV (Computer Vision and Pattern Recognition), eess.IV (Image and Video Processing)
   All-in-One Image Coding for Joint Human-Machine Vision with Multi-Path Aggregation
   Authors: Xu Zhang, Peiyao Guo, Ming Lu, Zhan Ma
   Abstract: Image coding for multi-task applications, catering to both human perception and machine vision, has been extensively investigated. Existing methods often rely on multiple task-specific encoder-decoder pairs, leading to high overhead of parameter and bitrate usage, or face challenges in multi-objective optimization under a unified representation, failing to achieve both performance and efficiency. To this end, we propose Multi-Path Aggregation (MPA) integrated into existing coding models for joint human-machine vision, unifying the feature representation with an all-in-one architecture. MPA employs a predictor to allocate latent features among task-specific paths based on feature importance varied across tasks, maximizing the utility of shared features while preserving task-specific features for subsequent refinement. Leveraging feature correlations, we develop a two-stage optimization strategy to alleviate multi-task performance degradation. Upon the reuse of shared features, as low as 1.89% of parameters are further augmented and fine-tuned for a specific task, which completely avoids extensive optimization of the entire model. Experimental results show that MPA achieves performance comparable to state-of-the-art methods in both task-specific and multi-objective optimization across human viewing and machine analysis tasks. Moreover, our all-in-one design supports seamless transitions between human- and machine-oriented reconstruction, enabling task-controllable interpretation without altering the unified model. Code is available at https://github.com/NJUVISION/MPA.
   Submitted 29 September, 2024; originally announced September 2024.
   Comments: NeurIPS 2024

7. arXiv:2409.11169 [pdf, other]
   Categories: eess.IV (Image and Video Processing), cs.AI (Artificial Intelligence), cs.CV (Computer Vision and Pattern Recognition)
   MAISI: Medical AI for Synthetic Imaging
   Authors: Pengfei Guo, Can Zhao, Dong Yang, Ziyue Xu, Vishwesh Nath, Yucheng Tang, Benjamin Simon, Mason Belue, Stephanie Harmon, Baris Turkbey, Daguang Xu
   Abstract: Medical imaging analysis faces challenges such as data scarcity, high annotation costs, and privacy concerns. This paper introduces Medical AI for Synthetic Imaging (MAISI), an innovative approach using the diffusion model to generate synthetic 3D computed tomography (CT) images to address those challenges. MAISI leverages the foundation volume compression network and the latent diffusion model to produce high-resolution CT images (up to a landmark volume dimension of 512 x 512 x 768) with flexible volume dimensions and voxel spacing. By incorporating ControlNet, MAISI can process organ segmentation, including 127 anatomical structures, as additional conditions and enables the generation of accurately annotated synthetic images that can be used for various downstream tasks. Our experimental results show that MAISI's capabilities in generating realistic, anatomically accurate images for diverse regions and conditions reveal its promising potential to mitigate challenges using synthetic data.
   Submitted 29 October, 2024; v1 submitted 13 September, 2024; originally announced September 2024.
   Comments: WACV25 accepted. https://monai.io/research/maisi

8. arXiv:2409.10076 [pdf, other]
   Categories: cs.SD (Sound), cs.HC (Human-Computer Interaction), eess.AS (Audio and Speech Processing)
   Optimizing Dysarthria Wake-Up Word Spotting: An End-to-End Approach for SLT 2024 LRDWWS Challenge
   Authors: Shuiyun Liu, Yuxiang Kong, Pengcheng Guo, Weiji Zhuang, Peng Gao, Yujun Wang, Lei Xie
   Abstract: Speech has emerged as a widely embraced user interface across diverse applications. However, for individuals with dysarthria, the inherent variability in their speech poses significant challenges. This paper presents an end-to-end Pretrain-based Dual-filter Dysarthria Wake-up word Spotting (PD-DWS) system for the SLT 2024 Low-Resource Dysarthria Wake-Up Word Spotting Challenge. Specifically, our system improves performance from two key perspectives: audio modeling and dual-filter strategy. For audio modeling, we propose an innovative 2branch-d2v2 model based on the pre-trained data2vec2 (d2v2), which can simultaneously model automatic speech recognition (ASR) and wake-up word spotting (WWS) tasks through a unified multi-task finetuning paradigm. Additionally, a dual-filter strategy is introduced to reduce the false accept rate (FAR) while maintaining the same false reject rate (FRR). Experimental results demonstrate that our PD-DWS system achieves an FAR of 0.00321 and an FRR of 0.005, with a total score of 0.00821 on the test-B eval set, securing first place in the challenge.
   Submitted 16 September, 2024; originally announced September 2024.
   Comments: 8 pages, Accepted to SLT 2024

9. arXiv:2409.04173 [pdf, other]
   Categories: eess.AS (Audio and Speech Processing)
   NPU-NTU System for Voice Privacy 2024 Challenge
   Authors: Jixun Yao, Nikita Kuzmin, Qing Wang, Pengcheng Guo, Ziqian Ning, Dake Guo, Kong Aik Lee, Eng-Siong Chng, Lei Xie
   Abstract: Speaker anonymization is an effective privacy protection solution that conceals the speaker's identity while preserving the linguistic content and paralinguistic information of the original speech. To establish a fair benchmark and facilitate comparison of speaker anonymization systems, the VoicePrivacy Challenge (VPC) was held in 2020 and 2022, with a new edition planned for 2024. In this paper, we describe our proposed speaker anonymization system for VPC 2024. Our system employs a disentangled neural codec architecture and a serial disentanglement strategy to gradually disentangle the global speaker identity and time-variant linguistic content and paralinguistic information. We introduce multiple distillation methods to disentangle linguistic content, speaker identity, and emotion. These methods include semantic distillation, supervised speaker distillation, and frame-level emotion distillation. Based on these distillations, we anonymize the original speaker identity using a weighted sum of a set of candidate speaker identities and a randomly generated speaker identity. Our system achieves the best trade-off of privacy protection and emotion preservation in VPC 2024.
   Submitted 4 February, 2025; v1 submitted 6 September, 2024; originally announced September 2024.
   Comments: System description for VPC 2024

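The anonymization step quoted above (a weighted sum over candidate speaker identities plus a randomly generated identity) can be written down directly. The NumPy sketch below does exactly that for embedding vectors; the candidate pool, selection heuristic, weighting scheme, and normalization are assumptions made only to keep the example concrete.

```python
import numpy as np

rng = np.random.default_rng(0)
dim = 192                                   # e.g. an x-vector/ECAPA-style embedding size

def anonymize(orig_emb: np.ndarray, candidates: np.ndarray,
              n_select: int = 10, random_weight: float = 0.3) -> np.ndarray:
    """Weighted sum of selected candidate speaker embeddings plus a random one."""
    # Pick a subset of candidate identities (here: farthest from the original,
    # a common heuristic; the actual selection rule is an assumption).
    dists = np.linalg.norm(candidates - orig_emb, axis=1)
    chosen = candidates[np.argsort(dists)[-n_select:]]
    weights = rng.dirichlet(np.ones(n_select))       # convex combination weights
    pseudo = weights @ chosen
    random_id = rng.normal(size=orig_emb.shape)      # randomly generated identity
    anon = (1.0 - random_weight) * pseudo + random_weight * random_id
    return anon / np.linalg.norm(anon)               # keep unit norm

orig = rng.normal(size=dim)
pool = rng.normal(size=(200, dim))                   # candidate speaker pool
anon = anonymize(orig, pool)
print("cosine(original, anonymized) =",
      float(orig @ anon / (np.linalg.norm(orig) * np.linalg.norm(anon))))
```
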
10. arXiv:2408.10680 [pdf, other]
    Categories: cs.CL (Computation and Language), cs.SD (Sound), eess.AS (Audio and Speech Processing)
    Towards Rehearsal-Free Multilingual ASR: A LoRA-based Case Study on Whisper
    Authors: Tianyi Xu, Kaixun Huang, Pengcheng Guo, Yu Zhou, Longtao Huang, Hui Xue, Lei Xie
    Abstract: Pre-trained multilingual speech foundation models, like Whisper, have shown impressive performance across different languages. However, adapting these models to new or specific languages is computationally expensive and faces catastrophic forgetting problems. Addressing these issues, our study investigates strategies to enhance the model on new languages in the absence of original training data, while also preserving the established performance on the original languages. Specifically, we first compare various LoRA-based methods to find out their vulnerability to forgetting. To mitigate this issue, we propose to leverage the LoRA parameters from the original model for approximate orthogonal gradient descent on the new samples. Additionally, we also introduce a learnable rank coefficient to allocate trainable parameters for more efficient training. Our experiments with a Chinese Whisper model (for Uyghur and Tibetan) yield better results with a more compact parameter set.
    Submitted 20 August, 2024; originally announced August 2024.

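"Approximate orthogonal gradient descent" using the original LoRA parameters is stated without detail in the abstract. One plausible sketch, given below purely as an illustration, projects the gradient of the new-language LoRA update onto the complement of the subspace spanned by the original LoRA directions and scales rank components with a learnable coefficient; the projection target, where the coefficient sits, and all shapes are assumptions.

```python
import torch

torch.manual_seed(0)
d_in, d_out, rank = 64, 64, 8

# LoRA factors learned on the *original* languages (frozen).
A_old = torch.randn(rank, d_in)                  # spans the "old-task" input subspace
# New-language LoRA factors (trainable), plus a learnable per-rank coefficient.
# (Real LoRA initializes B to zero; random here so the toy gradients are nonzero.)
A_new = torch.randn(rank, d_in, requires_grad=True)
B_new = torch.randn(d_out, rank, requires_grad=True)
rank_coeff = torch.ones(rank, requires_grad=True)

# Orthonormal basis of the old LoRA input directions (rows of A_old).
Q, _ = torch.linalg.qr(A_old.T)                  # (d_in, rank), columns orthonormal

def project_out_old_subspace(grad: torch.Tensor) -> torch.Tensor:
    """Remove the gradient component lying in span(A_old), i.e. keep the update
    approximately orthogonal to directions the original model already uses."""
    return grad - (grad @ Q) @ Q.T

# One illustrative training step on a dummy batch.
x = torch.randn(32, d_in)
target = torch.randn(32, d_out)
delta_w = B_new @ (torch.diag(rank_coeff) @ A_new)   # rank-scaled LoRA update
loss = torch.mean((x @ delta_w.T - target) ** 2)
loss.backward()

with torch.no_grad():
    A_new -= 1e-2 * project_out_old_subspace(A_new.grad)   # projected step
    B_new -= 1e-2 * B_new.grad
    rank_coeff -= 1e-2 * rank_coeff.grad
print("loss:", float(loss))
```
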
arXiv:2408.02085 [pdf, other] https://arxiv.org/abs/2408.02085
Subjects: cs.CV, cs.AI, cs.CL, eess.SP
Title: Unleashing the Power of Data Tsunami: A Comprehensive Survey on Data Assessment and Selection for Instruction Tuning of Language Models
Authors: Yulei Qin, Yuncheng Yang, Pengcheng Guo, Gang Li, Hang Shao, Yuchen Shi, Zihan Xu, Yun Gu, Ke Li, Xing Sun
Abstract: Instruction tuning plays a critical role in aligning large language models (LLMs) with human preference. Despite the vast amount of open instruction datasets, naively training an LLM on all existing instructions may not be optimal or practical. To pinpoint the most beneficial datapoints, data assessment and selection methods have been proposed in the fields of natural language processing (NLP) and deep learning. However, in the context of instruction tuning, there still exists a gap in knowledge on what kind of data evaluation metrics can be employed and how they can be integrated into the selection mechanism. To bridge this gap, we present a comprehensive review of the existing literature on data assessment and selection, especially for instruction tuning of LLMs. We systematically categorize all applicable methods into quality-based, diversity-based, and importance-based ones, organized in a unified, fine-grained taxonomy. For each category, representative methods are elaborated to describe the landscape of relevant research. In addition, comparison between the latest methods is conducted on their officially reported results to provide in-depth discussions on their limitations. Finally, we summarize the open challenges and propose promising avenues for future studies. All related content is available at https://github.com/yuleiqin/fantastic-data-engineering.
Submitted 28 December, 2024; v1 submitted 4 August, 2024; originally announced August 2024.
Comments: Accepted to TMLR with Survey Certificate; review, survey, 37 pages, 5 figures, 4 tables

arXiv:2407.11629 [pdf, other] https://arxiv.org/abs/2407.11629
Subjects: eess.AS
Title: MUSA: Multi-lingual Speaker Anonymization via Serial Disentanglement
Authors: Jixun Yao, Qing Wang, Pengcheng Guo, Ziqian Ning, Yuguang Yang, Yu Pan, Lei Xie
Abstract: Speaker anonymization is an effective privacy protection solution designed to conceal the speaker's identity while preserving the linguistic content and para-linguistic information of the original speech. While most prior studies focus solely on a single language, an ideal speaker anonymization system should be capable of handling multiple languages. This paper proposes MUSA, a Multi-lingual Speaker Anonymization approach that employs a serial disentanglement strategy to perform a step-by-step disentanglement from a global time-invariant representation to a temporal time-variant representation. By utilizing semantic distillation and self-supervised speaker distillation, the serial disentanglement strategy can avoid strong inductive biases and exhibit superior generalization performance across different languages. Meanwhile, we propose a straightforward anonymization strategy that employs an empty embedding with zero values to simulate the speaker identity concealment process, eliminating the need for conversion to a pseudo-speaker identity and thereby reducing the complexity of the speaker anonymization process. Experimental results on the VoicePrivacy official datasets and multi-lingual datasets demonstrate that MUSA can effectively protect speaker privacy while preserving linguistic content and para-linguistic information.
Submitted 16 July, 2024; originally announced July 2024.
Comments: Submitted to TASLP

arXiv:2407.03307 [pdf, other] https://arxiv.org/abs/2407.03307
Subjects: eess.IV, cs.CV
Title: HoloHisto: End-to-end Gigapixel WSI Segmentation with 4K Resolution Sequential Tokenization
Authors: Yucheng Tang, Yufan He, Vishwesh Nath, Pengfeig Guo, Ruining Deng, Tianyuan Yao, Quan Liu, Can Cui, Mengmeng Yin, Ziyue Xu, Holger Roth, Daguang Xu, Haichun Yang, Yuankai Huo
Abstract: In digital pathology, the traditional method for deep learning-based image segmentation typically involves a two-stage process: initially segmenting high-resolution whole slide images (WSI) into smaller patches (e.g., 256x256, 512x512, 1024x1024) and subsequently reconstructing them to their original scale. This method often struggles to capture the complex details and vast scope of WSIs. In this paper, we propose the holistic histopathology (HoloHisto) segmentation method to achieve end-to-end segmentation on gigapixel WSIs, whose maximum resolution is above 80,000×70,000 pixels. HoloHisto fundamentally shifts the paradigm of WSI segmentation to an end-to-end learning fashion with 1) a large (4K) resolution base patch for elevated visual information inclusion and efficient processing, and 2) a novel sequential tokenization mechanism to properly model the contextual relationships and efficiently model the rich information from the 4K input. To the best of our knowledge, HoloHisto presents the first holistic approach for gigapixel-resolution WSI segmentation, supporting direct I/O of complete WSIs and their corresponding gigapixel masks. Under the HoloHisto platform, we unveil a random 4K sampler that transcends ultra-high resolution, delivering 31 and 10 times more pixels than standard 2D and 3D patches, respectively, for advancing computational capabilities. To facilitate efficient 4K-resolution dense prediction, we leverage sequential tokenization, utilizing a pre-trained image tokenizer to group image features into a discrete token grid. To assess the performance, our team curated a new kidney pathology image segmentation (KPIs) dataset with WSI-level glomeruli segmentation from whole mouse kidneys. From the results, HoloHisto-4K delivers remarkable performance gains over previous state-of-the-art models.
Submitted 3 July, 2024; originally announced July 2024.
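To make the sequential tokenization idea above concrete, here is a minimal sketch that turns a 4K patch into a grid of per-tile feature vectors ready for sequence modelling. The tile size, feature layout, and the absence of a real pre-trained tokenizer are assumptions, not the paper's configuration:

    import torch

    def to_token_grid(patch_4k: torch.Tensor, tile: int = 16) -> torch.Tensor:
        # Split a (C, H, W) image patch into an (H//tile, W//tile) grid of flattened tiles.
        c, h, w = patch_4k.shape
        assert h % tile == 0 and w % tile == 0
        grid = patch_4k.reshape(c, h // tile, tile, w // tile, tile)
        grid = grid.permute(1, 3, 0, 2, 4).reshape(h // tile, w // tile, c * tile * tile)
        return grid  # one feature vector per grid cell

    tokens = to_token_grid(torch.randn(3, 2160, 3840))   # a "4K" patch (3840x2160)
    print(tokens.shape)                                   # torch.Size([135, 240, 768])

In the paper's pipeline a pre-trained image tokenizer would map each tile to a discrete token rather than a raw flattened vector; this sketch only illustrates the grid construction.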
arXiv:2405.19366 [pdf, other] https://arxiv.org/abs/2405.19366
Subjects: eess.SP, cs.AI
Title: ECG Semantic Integrator (ESI): A Foundation ECG Model Pretrained with LLM-Enhanced Cardiological Text
Authors: Han Yu, Peikun Guo, Akane Sano
Abstract: The application of deep learning to electrocardiogram (ECG) analysis has improved the accuracy and efficiency of cardiac healthcare diagnostics. By leveraging the capabilities of deep learning in semantic understanding, especially in feature extraction and representation learning, this study introduces a new multimodal contrastive pretraining framework that aims to improve the quality and robustness of learned representations of 12-lead ECG signals. Our framework comprises two key components: the Cardio Query Assistant (CQA) and the ECG Semantics Integrator (ESI). CQA integrates a retrieval-augmented generation (RAG) pipeline to leverage large language models (LLMs) and external medical knowledge to generate detailed textual descriptions of ECGs. The generated text is enriched with information about demographics and waveform patterns. ESI integrates both contrastive and captioning loss to pretrain ECG encoders for enhanced representations. We validate our approach through various downstream tasks, including arrhythmia detection and ECG-based subject identification. Our experimental results demonstrate substantial improvements over strong baselines in these tasks, including supervised and self-supervised learning methods as well as prior multimodal pretraining approaches.
Submitted 23 October, 2024; v1 submitted 26 May, 2024; originally announced May 2024.
arXiv:2405.10786 [pdf, other] https://arxiv.org/abs/2405.10786
Subjects: eess.AS
Title: Distinctive and Natural Speaker Anonymization via Singular Value Transformation-assisted Matrix
Authors: Jixun Yao, Qing Wang, Pengcheng Guo, Ziqian Ning, Lei Xie
Abstract: Speaker anonymization is an effective privacy protection solution that aims to conceal the speaker's identity while preserving the naturalness and distinctiveness of the original speech. Mainstream approaches use an utterance-level vector from a pre-trained automatic speaker verification (ASV) model to represent speaker identity, which is then averaged or modified for anonymization. However, these systems suffer from deterioration in the naturalness of anonymized speech, degradation in speaker distinctiveness, and severe privacy leakage against powerful attackers. To address these issues, and especially to generate more natural and distinctive anonymized speech, we propose a novel speaker anonymization approach that models a matrix related to speaker identity and transforms it into an anonymized singular value transformation-assisted matrix to conceal the original speaker identity. Our approach extracts frame-level speaker vectors from a pre-trained ASV model and employs an attention mechanism to create a speaker-score matrix and speaker-related tokens. Notably, the speaker-score matrix acts as the weight for the corresponding speaker-related token, representing the speaker's identity. The singular value transformation-assisted matrix is generated by recomposing the decomposed orthonormal eigenvector matrices with non-linearly transformed singular values obtained through Singular Value Decomposition (SVD). Experiments on the VoicePrivacy Challenge datasets demonstrate the effectiveness of our approach in protecting speaker privacy under all attack scenarios while maintaining speech naturalness and distinctiveness.
Submitted 17 May, 2024; originally announced May 2024.
Comments: Accepted by IEEE/ACM Transactions on Audio, Speech, and Language Processing
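The singular value transformation step can be pictured with a small NumPy sketch: decompose a speaker-related matrix, warp its singular values with a non-linear function, and recompose the orthonormal factors. The power-law warp, its exponent, and the matrix sizes are assumptions; the paper's actual transformation is not reproduced here:

    import numpy as np

    def svd_transform_matrix(speaker_matrix: np.ndarray, alpha: float = 0.5) -> np.ndarray:
        # SVD-decompose, warp the singular values non-linearly, then recompose.
        U, s, Vt = np.linalg.svd(speaker_matrix, full_matrices=False)
        s_anon = s.max() * (s / s.max()) ** alpha     # illustrative non-linear transform
        return U @ np.diag(s_anon) @ Vt               # anonymized, transformation-assisted matrix

    anon = svd_transform_matrix(np.random.randn(192, 64))
    print(anon.shape)   # (192, 64)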
arXiv:2405.02132 [pdf, other] https://arxiv.org/abs/2405.02132
Subjects: cs.SD, cs.CL, eess.AS
Title: Unveiling the Potential of LLM-Based ASR on Chinese Open-Source Datasets
Authors: Xuelong Geng, Tianyi Xu, Kun Wei, Bingshen Mu, Hongfei Xue, He Wang, Yangze Li, Pengcheng Guo, Yuhang Dai, Longhao Li, Mingchen Shao, Lei Xie
Abstract: Large Language Models (LLMs) have demonstrated unparalleled effectiveness in various NLP tasks, and integrating LLMs with automatic speech recognition (ASR) is becoming a mainstream paradigm. Building upon this momentum, our research conducts an in-depth examination of this paradigm on a large open-source Chinese dataset. Specifically, we aim to evaluate the impact of various configurations of speech encoders, LLMs, and projector modules in the context of the speech foundation encoder-LLM ASR paradigm. Furthermore, we introduce a three-stage training approach, expressly developed to enhance the model's ability to align auditory and textual information. The implementation of this approach, alongside the strategic integration of ASR components, enabled us to achieve SOTA performance on the AISHELL-1, Test_Net, and Test_Meeting test sets. Our analysis presents an empirical foundation for future research in LLM-based ASR systems and offers insights into optimizing performance using Chinese datasets. We will publicly release all scripts used for data preparation, training, inference, and scoring, as well as pre-trained models and training logs, to promote reproducible research.
Submitted 4 November, 2024; v1 submitted 3 May, 2024; originally announced May 2024.
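A hypothetical projector module of the kind evaluated in such encoder-LLM ASR setups: downsample speech-encoder frames by stacking and map them into the LLM embedding space. The layer sizes, the 4x frame stacking, and the dimensions are assumptions, not the paper's configuration:

    import torch
    import torch.nn as nn

    class SpeechToLLMProjector(nn.Module):
        # Maps (B, T, enc_dim) speech-encoder outputs to (B, T/stack, llm_dim) LLM inputs.
        def __init__(self, enc_dim=1280, llm_dim=4096, stack=4):
            super().__init__()
            self.stack = stack
            self.proj = nn.Sequential(
                nn.Linear(enc_dim * stack, llm_dim),
                nn.GELU(),
                nn.Linear(llm_dim, llm_dim),
            )

        def forward(self, feats: torch.Tensor) -> torch.Tensor:
            b, t, d = feats.shape
            t = t - t % self.stack                                  # drop ragged tail frames
            stacked = feats[:, :t].reshape(b, t // self.stack, d * self.stack)
            return self.proj(stacked)

    tokens = SpeechToLLMProjector()(torch.randn(2, 101, 1280))
    print(tokens.shape)   # torch.Size([2, 25, 4096])

Stacking adjacent frames both shortens the sequence seen by the LLM and gives the projector a wider acoustic context per token; this is one common design choice among several.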
arXiv:2405.00259 https://arxiv.org/abs/2405.00259
Subjects: physics.med-ph, eess.IV
Title: Optimization of Dark-Field CT for Lung Imaging
Authors: Peiyuan Guo, Simon Spindler, Li Zhang, Zhentian Wang
Abstract: Background: X-ray grating-based dark-field imaging can sense the small-angle scattering caused by an object's micro-structure. This technique is sensitive to the lung's porous alveoli and is able to detect lung disease at an early stage. Up to now, a human-scale dark-field CT has been built for lung imaging. Purpose: This study aimed to develop a more thorough optimization method for dark-field lung CT and summarize principles for system design. Methods: We proposed a metric in the form of contrast-to-noise ratio (CNR) for system parameter optimization, and designed a phantom with a concentric-circle shape to fit the task of lung disease detection. Finally, we developed the calculation method for the CNR metric and analyzed the relation between CNR and system parameters. Results: We showed that with other parameters held constant, the CNR first increases and then decreases with the system auto-correlation length (ACL). The optimal ACL is nearly unaffected by the system's visibility and is only related to the phantom's properties, i.e., the scattering material's size and the phantom's absorption. For our phantom, the optimal ACL is about 0.21 µm. As for system geometry, larger source-detector and isocenter-detector distances can increase the system's maximal ACL, helping the system meet the optimal ACL more easily. Conclusions: This study proposed a more reasonable metric and a task-based process for optimization, and demonstrated that the system's optimal ACL is only related to the phantom's properties.
Submitted 1 May, 2024; v1 submitted 30 April, 2024; originally announced May 2024.
Comments: There is a mistake in subsection 2.3, where the content is not correct because of an incorrect parameter we set, which makes the calculations in the following sections potentially incorrect

arXiv:2401.06788 [pdf, other] https://arxiv.org/abs/2401.06788
Subjects: eess.AS, cs.AI, cs.SD
Title: The NPU-ASLP-LiAuto System Description for Visual Speech Recognition in CNVSRC 2023
Authors: He Wang, Pengcheng Guo, Wei Chen, Pan Zhou, Lei Xie
Abstract: This paper delineates the visual speech recognition (VSR) system introduced by NPU-ASLP-LiAuto (Team 237) in the first Chinese Continuous Visual Speech Recognition Challenge (CNVSRC) 2023, engaging in the fixed and open tracks of the Single-Speaker VSR Task and the open track of the Multi-Speaker VSR Task. In terms of data processing, we leverage the lip motion extractor from the baseline to produce multi-scale video data. In addition, various augmentation techniques are applied during training, encompassing speed perturbation, random rotation, horizontal flipping, and color transformation. The VSR model adopts an end-to-end architecture with joint CTC/attention loss, comprising a ResNet3D visual frontend, an E-Branchformer encoder, and a Transformer decoder. Experiments show that our system achieves 34.76% CER for the Single-Speaker Task and 41.06% CER for the Multi-Speaker Task after multi-system fusion, ranking first in all three tracks we participated in.
Submitted 29 February, 2024; v1 submitted 7 January, 2024; originally announced January 2024.
Comments: Included in CNVSRC Workshop 2023, NCMMSC 2023
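The joint CTC/attention objective named in the abstract above is a standard formulation in end-to-end speech and visual speech recognition; a compact sketch follows, where the 0.3 CTC weight and tensor shapes are conventional assumptions rather than the system's exact settings:

    import torch
    import torch.nn.functional as F

    def joint_ctc_attention_loss(ctc_log_probs, ctc_target, in_lens, tgt_lens,
                                 att_logits, att_target, ctc_weight=0.3, ignore_id=-1):
        # ctc_log_probs: (T, B, V) log-softmax outputs; att_logits: (B, L, V) decoder logits.
        ctc = F.ctc_loss(ctc_log_probs, ctc_target, in_lens, tgt_lens, zero_infinity=True)
        att = F.cross_entropy(att_logits.transpose(1, 2), att_target, ignore_index=ignore_id)
        return ctc_weight * ctc + (1.0 - ctc_weight) * att

    T, B, V, L = 50, 2, 30, 10
    loss = joint_ctc_attention_loss(
        torch.randn(T, B, V).log_softmax(-1),
        torch.randint(1, V, (B, L)), torch.full((B,), T), torch.full((B,), L),
        torch.randn(B, L, V), torch.randint(0, V, (B, L)))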
arXiv:2401.04148 [pdf, other] https://arxiv.org/abs/2401.04148
Subjects: cs.LG, cs.AI, eess.SP
Title: Online Test-Time Adaptation of Spatial-Temporal Traffic Flow Forecasting
Authors: Pengxin Guo, Pengrong Jin, Ziyue Li, Lei Bai, Yu Zhang
Abstract: Accurate spatial-temporal traffic flow forecasting is crucial in aiding traffic managers in implementing control measures and assisting drivers in selecting optimal travel routes. Traditional deep-learning-based methods for traffic flow forecasting typically rely on historical data to train their models, which are then used to make predictions on future data. However, the performance of the trained model usually degrades due to the temporal drift between the historical and future data. To make a model trained on historical data better adapt to future data in a fully online manner, this paper conducts the first study of online test-time adaptation techniques for spatial-temporal traffic flow forecasting problems. To this end, we propose an Adaptive Double Correction by Series Decomposition (ADCSD) method, which first decomposes the output of the trained model into seasonal and trend-cyclical parts and then corrects them with two separate modules during the testing phase, using the latest observed data entry by entry. In the proposed ADCSD method, instead of fine-tuning the whole trained model during the testing phase, a lite network is attached after the trained model, and only the lite network is fine-tuned in the testing process each time a data entry is observed. Moreover, to account for the fact that different time series variables may have different levels of temporal drift, two adaptive vectors are adopted to provide different weights for different time series variables. Extensive experiments on four real-world traffic flow forecasting datasets demonstrate the effectiveness of the proposed ADCSD method. The code is available at https://github.com/Pengxin-Guo/ADCSD.
Submitted 8 January, 2024; originally announced January 2024.
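A minimal sketch of an ADCSD-style correction head in the spirit of the abstract above, assuming a moving-average trend/seasonal split, two small linear correction modules, and per-variable adaptive weights; the kernel size and module shapes are illustrative, and only this lite head would be updated at test time while the forecaster stays frozen:

    import torch
    import torch.nn as nn

    class ADCSDLite(nn.Module):
        # Correction head attached after a frozen forecaster's output y_hat: (B, T, num_vars).
        def __init__(self, num_vars: int, kernel: int = 5):
            super().__init__()
            self.pool = nn.AvgPool1d(kernel, stride=1, padding=kernel // 2,
                                     count_include_pad=False)
            self.trend_fix = nn.Linear(num_vars, num_vars)
            self.season_fix = nn.Linear(num_vars, num_vars)
            self.alpha = nn.Parameter(torch.zeros(num_vars))   # per-variable adaptive weights
            self.beta = nn.Parameter(torch.zeros(num_vars))

        def forward(self, y_hat: torch.Tensor) -> torch.Tensor:
            trend = self.pool(y_hat.transpose(1, 2)).transpose(1, 2)   # moving-average trend
            season = y_hat - trend                                     # seasonal remainder
            return (trend + self.alpha * self.trend_fix(trend)
                    + season + self.beta * self.season_fix(season))

    out = ADCSDLite(num_vars=207)(torch.randn(8, 12, 207))
    print(out.shape)   # torch.Size([8, 12, 207])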
arXiv:2401.03697 [pdf, other] https://arxiv.org/abs/2401.03697
Subjects: cs.SD, eess.AS
Title: An audio-quality-based multi-strategy approach for target speaker extraction in the MISP 2023 Challenge
Authors: Runduo Han, Xiaopeng Yan, Weiming Xu, Pengcheng Guo, Jiayao Sun, He Wang, Quan Lu, Ning Jiang, Lei Xie
Abstract: This paper describes our audio-quality-based multi-strategy approach for the audio-visual target speaker extraction (AVTSE) task in the Multi-modal Information based Speech Processing (MISP) 2023 Challenge. Specifically, our approach adopts different extraction strategies based on the audio quality, striking a balance between interference removal and speech preservation, which benefits the back-end automatic speech recognition (ASR) systems. Experiments show that our approach achieves a character error rate (CER) of 24.2% and 33.2% on the Dev and Eval sets, respectively, obtaining second place in the challenge.
Submitted 6 March, 2024; v1 submitted 8 January, 2024; originally announced January 2024.
Comments: Accepted by ICASSP 2024

arXiv:2401.03473 [pdf, ps, other] https://arxiv.org/abs/2401.03473
Subjects: cs.SD, cs.AI, eess.AS
Title: ICMC-ASR: The ICASSP 2024 In-Car Multi-Channel Automatic Speech Recognition Challenge
Authors: He Wang, Pengcheng Guo, Yue Li, Ao Zhang, Jiayao Sun, Lei Xie, Wei Chen, Pan Zhou, Hui Bu, Xin Xu, Binbin Zhang, Zhuo Chen, Jian Wu, Longbiao Wang, Eng Siong Chng, Sun Li
Abstract: To promote speech processing and recognition research in driving scenarios, we build on the success of the Intelligent Cockpit Speech Recognition Challenge (ICSRC) held at ISCSLP 2022 and launch the ICASSP 2024 In-Car Multi-Channel Automatic Speech Recognition (ICMC-ASR) Challenge. This challenge collects over 100 hours of multi-channel speech data recorded inside a new energy vehicle and 40 hours of noise for data augmentation. Two tracks, automatic speech recognition (ASR) and automatic speech diarization and recognition (ASDR), are set up, using character error rate (CER) and concatenated minimum permutation character error rate (cpCER) as evaluation metrics, respectively. Overall, the ICMC-ASR Challenge attracts 98 participating teams and receives 53 valid results across the two tracks. In the end, the first-place team USTCiflytek achieves a CER of 13.16% in the ASR track and a cpCER of 21.48% in the ASDR track, showing an absolute improvement of 13.08% and 51.4% compared to our challenge baseline, respectively.
Submitted 20 February, 2024; v1 submitted 7 January, 2024; originally announced January 2024.
Comments: Accepted at ICASSP 2024
arXiv:2401.03424 [pdf, other] https://arxiv.org/abs/2401.03424
DOI: 10.1109/ICASSP48485.2024.10446769 (https://doi.org/10.1109/ICASSP48485.2024.10446769)
Subjects: cs.SD, cs.AI, eess.AS
Title: MLCA-AVSR: Multi-Layer Cross Attention Fusion based Audio-Visual Speech Recognition
Authors: He Wang, Pengcheng Guo, Pan Zhou, Lei Xie
Abstract: While automatic speech recognition (ASR) systems degrade significantly in noisy environments, audio-visual speech recognition (AVSR) systems aim to complement the audio stream with noise-invariant visual cues and improve the system's robustness. However, current studies mainly focus on fusing well-learned modality features, such as the outputs of modality-specific encoders, without considering the contextual relationship during modality feature learning. In this study, we propose a multi-layer cross-attention fusion based AVSR (MLCA-AVSR) approach that promotes representation learning of each modality by fusing them at different levels of the audio and visual encoders. Experimental results on the MISP2022-AVSR Challenge dataset show the efficacy of our proposed system, achieving a concatenated minimum permutation character error rate (cpCER) of 30.57% on the Eval set and yielding up to 3.17% relative improvement compared with our previous system, which ranked second in the challenge. Following the fusion of multiple systems, our proposed approach surpasses the first-place system, establishing a new SOTA cpCER of 29.13% on this dataset.
Submitted 8 April, 2024; v1 submitted 7 January, 2024; originally announced January 2024.
Comments: 5 pages, 3 figures. Accepted at ICASSP 2024
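A sketch of one intermediate-layer cross-attention fusion block of the kind the abstract above describes, where each modality attends to the other and the fused features are fed back into its own encoder; the dimensions, head count, residual-plus-norm form, and the layers at which fusion is applied are assumptions:

    import torch
    import torch.nn as nn

    class CrossModalFusion(nn.Module):
        # One fusion block applied between encoder layers of the audio and visual branches.
        def __init__(self, dim=256, heads=4):
            super().__init__()
            self.a2v = nn.MultiheadAttention(dim, heads, batch_first=True)
            self.v2a = nn.MultiheadAttention(dim, heads, batch_first=True)
            self.norm_a = nn.LayerNorm(dim)
            self.norm_v = nn.LayerNorm(dim)

        def forward(self, audio, video):               # (B, Ta, D), (B, Tv, D)
            a_ctx, _ = self.a2v(audio, video, video)   # audio queries attend to video
            v_ctx, _ = self.v2a(video, audio, audio)   # video queries attend to audio
            return self.norm_a(audio + a_ctx), self.norm_v(video + v_ctx)

    # e.g. applied after several encoder blocks of each branch (layer choice assumed)
    a, v = CrossModalFusion()(torch.randn(2, 120, 256), torch.randn(2, 30, 256))
    print(a.shape, v.shape)   # torch.Size([2, 120, 256]) torch.Size([2, 30, 256])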
arXiv:2312.09746 [pdf, other] https://arxiv.org/abs/2312.09746
Subjects: cs.SD, eess.AS
Title: Automatic channel selection and spatial feature integration for multi-channel speech recognition across various array topologies
Authors: Bingshen Mu, Pengcheng Guo, Dake Guo, Pan Zhou, Wei Chen, Lei Xie
Abstract: Automatic Speech Recognition (ASR) has shown remarkable progress, yet it still faces challenges in real-world distant scenarios across various array topologies, each with multiple recording devices. The focal point of the CHiME-7 Distant ASR task is to devise a unified system capable of generalizing across various array topologies that have multiple recording devices and of offering reliable recognition performance in real-world environments. Addressing this task, we introduce an ASR system that demonstrates exceptional performance across various array topologies. First, we propose two attention-based automatic channel selection modules to select the most advantageous subset of multi-channel signals from multiple recording devices for each utterance. Furthermore, we introduce inter-channel spatial features to augment the effectiveness of multi-frame cross-channel attention, improving its awareness of spatial information. Finally, we propose a multi-layer convolution fusion module, drawing inspiration from the U-Net architecture, to integrate the multi-channel output into a single-channel output. Experimental results on the CHiME-7 corpus with oracle segmentation demonstrate that the improvements introduced in our proposed ASR system lead to a relative reduction of 40.1% in the Macro Diarization Attributed Word Error Rate (DA-WER) compared to the baseline ASR system on the Eval sets.
Submitted 15 December, 2023; originally announced December 2023.
Comments: Accepted by ICASSP 2024
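An illustrative attention-style channel scorer in the spirit of the automatic channel selection modules above: pool each microphone channel to an utterance-level embedding, score it with a small head, and keep the top-k channels. The pooling, scoring head, and k=4 are assumptions, not the system's actual modules:

    import torch
    import torch.nn as nn

    class ChannelSelector(nn.Module):
        # Scores channels of multi-channel features (B, C, T, F) and keeps the top k.
        def __init__(self, feat_dim=80, hidden=128, k=4):
            super().__init__()
            self.k = k
            self.score = nn.Sequential(nn.Linear(feat_dim, hidden), nn.Tanh(),
                                       nn.Linear(hidden, 1))

        def forward(self, feats: torch.Tensor):
            utt = feats.mean(dim=2)                    # (B, C, F) utterance-level embeddings
            weights = self.score(utt).squeeze(-1)      # (B, C) channel scores
            top = weights.topk(self.k, dim=1).indices  # indices of the k most useful channels
            idx = top[..., None, None].expand(-1, -1, feats.size(2), feats.size(3))
            return torch.gather(feats, 1, idx), weights.softmax(dim=1)

    selected, scores = ChannelSelector()(torch.randn(2, 8, 500, 80))
    print(selected.shape)   # torch.Size([2, 4, 500, 80])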
In this paper, we propose a novel Decoupling and Interacting Multi-task Network (DIMNet) for joint speech and accent recognition, which is comprised of a connectionist temporal classification (CTC) branch, an AR branch, an ASR branch, and a bottom feature encoder. Specifically, AR and ASR are first decoupled by separated branches and two-granular modeling units to learn task-specific representations. The AR branch is from our previously proposed linguistic-acoustic bimodal AR model and the ASR branch is an encoder-decoder based Conformer model. Then, for the task interaction, the CTC branch provides aligned text for the AR task, while accent embeddings extracted from our AR model are incorporated into the ASR branch's encoder and decoder. Finally, during ASR inference, a cross-granular rescoring method is introduced to fuse the complementary information from the CTC and attention decoder after the decoupling. Our experiments on English and Chinese datasets demonstrate the effectiveness of the proposed model, which achieves 21.45%/28.53% AR accuracy relative improvement and 32.33%/14.55% ASR error rate relative reduction over a published standard baseline, respectively.
Submitted 17 November, 2023; v1 submitted 12 November, 2023; originally announced November 2023.
Comments: Accepted by IEEE Transactions on Audio, Speech and Language Processing (TASLP)

arXiv:2310.15930 [pdf, other] (cs.SD, eess.AS)
doi: 10.21437/Interspeech.2024-1597
CDSD: Chinese Dysarthria Speech Database
href="/search/eess?searchtype=author&amp;query=Wang%2C+S">Su-Jing Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2310.15930v2-abstract-short" style="display: inline;"> Dysarthric speech poses significant challenges for individuals with dysarthria, impacting their ability to communicate socially. Despite the widespread use of Automatic Speech Recognition (ASR), accurately recognizing dysarthric speech remains a formidable task, largely due to the limited availability of dysarthric speech data. To address this gap, we developed the Chinese Dysarthria Speech Databa&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.15930v2-abstract-full').style.display = 'inline'; document.getElementById('2310.15930v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2310.15930v2-abstract-full" style="display: none;"> Dysarthric speech poses significant challenges for individuals with dysarthria, impacting their ability to communicate socially. Despite the widespread use of Automatic Speech Recognition (ASR), accurately recognizing dysarthric speech remains a formidable task, largely due to the limited availability of dysarthric speech data. To address this gap, we developed the Chinese Dysarthria Speech Database (CDSD), the most extensive collection of Chinese dysarthria data to date, featuring 133 hours of recordings from 44 speakers. Our benchmarks reveal a best Character Error Rate (CER) of 16.4\%. Compared to the CER of 20.45\% from our additional human experiments, Dysarthric Speech Recognition (DSR) demonstrates its potential in significant improvement of communication for individuals with dysarthria. The CDSD database will be made publicly available at http://melab.psych.ac.cn/CDSD.html. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.15930v2-abstract-full').style.display = 'none'; document.getElementById('2310.15930v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 24 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2023. 
Comments: Accepted at INTERSPEECH 2024. Yan Wang and Mengyi Sun contributed equally to this research
Journal ref: Interspeech 2024

arXiv:2310.04863 [pdf, other] (cs.SD, eess.AS)
SA-Paraformer: Non-autoregressive End-to-End Speaker-Attributed ASR
Authors: Yangze Li, Fan Yu, Yuhao Liang, Pengcheng Guo, Mohan Shi, Zhihao Du, Shiliang Zhang, Lei Xie
Abstract: Joint modeling of multi-speaker ASR and speaker diarization has recently shown promising results in speaker-attributed automatic speech recognition (SA-ASR). Although being able to obtain state-of-the-art (SOTA) performance, most of the studies are based on an autoregressive (AR) decoder which generates tokens one-by-one and results in a large real-time factor (RTF). To speed up inference, we introduce a recently proposed non-autoregressive model Paraformer as an acoustic model in the SA-ASR model. Paraformer uses a single-step decoder to enable parallel generation, obtaining comparable performance to the SOTA AR transformer models. Besides, we propose a speaker-filling strategy to reduce speaker identification errors and adopt an inter-CTC strategy to enhance the encoder's ability in acoustic modeling.
Experiments on the AliMeeting corpus show that our model outperforms the cascaded SA-ASR model by a 6.1% relative speaker-dependent character error rate (SD-CER) reduction on the test set. Moreover, our model achieves a comparable SD-CER of 34.8% with only 1/10 RTF compared with the SOTA joint AR SA-ASR model.
Submitted 7 October, 2023; originally announced October 2023.

arXiv:2309.15800 [pdf, other] (cs.CL, cs.SD, eess.AS)
Exploring Speech Recognition, Translation, and Understanding with Discrete Speech Units: A Comparative Study
Authors: Xuankai Chang, Brian Yan, Kwanghee Choi, Jeeweon Jung, Yichen Lu, Soumi Maiti, Roshan Sharma, Jiatong Shi, Jinchuan Tian, Shinji Watanabe, Yuya Fujita, Takashi Maekaku, Pengcheng Guo, Yao-Fei Cheng, Pavel Denisov, Kohei Saijo, Hsiu-Hsuan Wang
Abstract: Speech signals, typically sampled at rates in the tens of thousands per second, contain redundancies, evoking inefficiencies in sequence modeling. High-dimensional speech features such as spectrograms are often used as the input for the subsequent model. However, they can still be redundant.
Recent investigations proposed the use of discrete speech units derived from self-supervised learning representations, which significantly compresses the size of speech data. Applying various methods, such as de-duplication and subword modeling, can further compress the speech sequence length. Hence, training time is significantly reduced while retaining notable performance. In this study, we undertake a comprehensive and systematic exploration into the application of discrete units within end-to-end speech processing models. Experiments on 12 automatic speech recognition, 3 speech translation, and 1 spoken language understanding corpora demonstrate that discrete units achieve reasonably good results in almost all the settings. We intend to release our configurations and trained models to foster future research efforts.
Submitted 27 September, 2023; originally announced September 2023.
Comments: Submitted to IEEE ICASSP 2024
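
To make the de-duplication step mentioned in this abstract concrete, here is a tiny, self-contained Python example (not from the paper) that collapses consecutive repeats in a sequence of discrete unit IDs; the unit values are invented, and the comment about a subsequent subword pass is only an assumption about how such sequences are typically shortened further.

from itertools import groupby

units = [52, 52, 52, 7, 7, 913, 913, 913, 913, 7]   # hypothetical k-means cluster IDs of SSL features
deduped = [u for u, _ in groupby(units)]             # collapse consecutive repeats ("de-duplication")

print(len(units), "->", len(deduped))                # 10 -> 4
print(deduped)                                       # [52, 7, 913, 7]
# A subword model (e.g. BPE trained on such de-duplicated unit strings) could then merge
# frequent unit n-grams to shorten the sequence further.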

arXiv:2309.00929 [pdf, other] (cs.SD, eess.AS)
Timbre-reserved Adversarial Attack in Speaker Identification
Authors: Qing Wang, Jixun Yao, Li Zhang, Pengcheng Guo, Lei Xie
Abstract: As a type of biometric identification, a speaker identification (SID) system is confronted with various kinds of attacks. The spoofing attacks typically imitate the timbre of the target speakers, while the adversarial attacks confuse the SID system by adding a well-designed adversarial perturbation to an arbitrary speech. Although the spoofing attack copies a similar timbre as the victim, it does not exploit the vulnerability of the SID model and may not make the SID system give the attacker's desired decision. As for the adversarial attack, although the SID system can be led to a designated decision, it cannot meet the specified text or speaker timbre requirements for the specific attack scenarios. In this study, to make the attack in SID not only leverage the vulnerability of the SID model but also reserve the timbre of the target speaker, we propose a timbre-reserved adversarial attack in the speaker identification. We generate the timbre-reserved adversarial audios by adding an adversarial constraint during the different training stages of the voice conversion (VC) model.
Specifically, the adversarial constraint uses the target speaker label to optimize the adversarial perturbation added to the VC model representations and is implemented by a speaker classifier joining in the VC model training. The adversarial constraint can help to control the VC model to generate the speaker-wised audio. Eventually, the inference of the VC model is the ideal adversarial fake audio, which is timbre-reserved and can fool the SID system.
Submitted 2 September, 2023; originally announced September 2023.
Comments: 11 pages, 8 figures
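
The following is a minimal PyTorch sketch of the general idea of an adversarial constraint driven by a speaker classifier, as described above: a perturbation added to a representation is optimized so that a fixed classifier outputs a chosen target speaker. It is not the authors' training recipe; the classifier, shapes, learning rate, and step count are all hypothetical.

import torch
import torch.nn as nn

torch.manual_seed(0)
num_speakers, feat_dim, frames = 10, 256, 120

speaker_clf = nn.Linear(feat_dim, num_speakers)        # stand-in for a trained speaker classifier
for p in speaker_clf.parameters():
    p.requires_grad_(False)                             # the classifier stays fixed

vc_repr = torch.randn(frames, feat_dim)                 # stand-in for a representation inside a VC model
delta = torch.zeros_like(vc_repr, requires_grad=True)   # adversarial perturbation to optimize
target = torch.tensor([3])                              # desired target-speaker label
opt = torch.optim.Adam([delta], lr=1e-2)

for _ in range(100):
    logits = speaker_clf(vc_repr + delta).mean(dim=0, keepdim=True)  # utterance-level logits
    loss = nn.functional.cross_entropy(logits, target)
    opt.zero_grad()
    loss.backward()
    opt.step()

print(int(speaker_clf(vc_repr + delta).mean(dim=0).argmax()))        # ideally prints 3 after optimization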

arXiv:2306.00804 [pdf, other] (cs.SD, cs.CL, eess.AS)
Adaptive Contextual Biasing for Transducer Based Streaming Speech Recognition
Authors: Tianyi Xu, Zhanheng Yang, Kaixun Huang, Pengcheng Guo, Ao Zhang, Biao Li, Changru Chen, Chao Li, Lei Xie
Abstract: By incorporating additional contextual information, deep biasing methods have emerged as a promising solution for speech recognition of personalized words. However, for real-world voice assistants, always biasing on such personalized words with high prediction scores can significantly degrade the performance of recognizing common words. To address this issue, we propose an adaptive contextual biasing method based on Context-Aware Transformer Transducer (CATT) that utilizes the biased encoder and predictor embeddings to perform streaming prediction of contextual phrase occurrences. Such prediction is then used to dynamically switch the bias list on and off, enabling the model to adapt to both personalized and common scenarios. Experiments on Librispeech and internal voice assistant datasets show that our approach can achieve up to 6.7% and 20.7% relative reduction in WER and CER compared to the baseline respectively, mitigating up to 96.7% and 84.9% of the relative WER and CER increase for common cases. Furthermore, our approach has a minimal performance impact in personalized scenarios while maintaining a streaming inference pipeline with negligible RTF increase.
Submitted 15 August, 2023; v1 submitted 1 June, 2023; originally announced June 2023.
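
A minimal sketch of the on/off switching idea described above, assuming (hypothetically) a per-step occurrence predictor over concatenated encoder and predictor embeddings; this is not the CATT-based implementation from the paper.

import torch
import torch.nn as nn

class BiasGate(nn.Module):
    # Hypothetical gate: predicts whether a contextual phrase is being spoken at this step
    # and switches the bias information on or off accordingly.
    def __init__(self, enc_dim=256, pred_dim=256):
        super().__init__()
        self.occurrence = nn.Linear(enc_dim + pred_dim, 1)

    def forward(self, enc_frame, pred_state, bias_vector, threshold=0.5):
        p = torch.sigmoid(self.occurrence(torch.cat([enc_frame, pred_state], dim=-1)))
        gate = (p > threshold).float()                # hard on/off decision per step
        return gate * bias_vector, p                  # bias suppressed when occurrence is unlikely

gate = BiasGate()
enc, pred, bias = torch.randn(256), torch.randn(256), torch.randn(256)
biased, prob = gate(enc, pred, bias)
print(float(prob), float(biased.abs().sum()))         # bias vector is zeroed out when prob <= threshold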

arXiv:2305.19020 [pdf, other] (cs.SD, eess.AS)
Pseudo-Siamese Network based Timbre-reserved Black-box Adversarial Attack in Speaker Identification
Authors: Qing Wang, Jixun Yao, Ziqian Wang, Pengcheng Guo, Lei Xie
Abstract: In this study, we propose a timbre-reserved adversarial attack approach for speaker identification (SID) to not only exploit the weakness of the SID model but also preserve the timbre of the target speaker in a black-box attack setting. Particularly, we generate timbre-reserved fake audio by adding an adversarial constraint during the training of the voice conversion model. Then, we leverage a pseudo-Siamese network architecture to learn from the black-box SID model constraining both intrinsic similarity and structural similarity simultaneously. The intrinsic similarity loss is to learn an intrinsic invariance, while the structural similarity loss is to ensure that the substitute SID model shares a similar decision boundary to the fixed black-box SID model. The substitute model can be used as a proxy to generate timbre-reserved fake audio for attacking. Experimental results on the Audio Deepfake Detection (ADD) challenge dataset indicate that the attack success rate of our proposed approach yields up to 60.58% and 55.38% in the white-box and black-box scenarios, respectively, and can deceive both human beings and machines.
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.19020v1-abstract-full').style.display = 'none'; document.getElementById('2305.19020v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2305.13716">arXiv:2305.13716</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2305.13716">pdf</a>, <a href="https://arxiv.org/format/2305.13716">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> BA-SOT: Boundary-Aware Serialized Output Training for Multi-Talker ASR </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Liang%2C+Y">Yuhao Liang</a>, <a href="/search/eess?searchtype=author&amp;query=Yu%2C+F">Fan Yu</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+Y">Yangze Li</a>, <a href="/search/eess?searchtype=author&amp;query=Guo%2C+P">Pengcheng Guo</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+S">Shiliang Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+Q">Qian Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Xie%2C+L">Lei Xie</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2305.13716v3-abstract-short" style="display: inline;"> The recently proposed serialized output training (SOT) simplifies multi-talker automatic speech recognition (ASR) by generating speaker transcriptions separated by a special token. However, frequent speaker changes can make speaker change prediction difficult. To address this, we propose boundary-aware serialized output training (BA-SOT), which explicitly incorporates boundary knowledge into the d&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.13716v3-abstract-full').style.display = 'inline'; document.getElementById('2305.13716v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2305.13716v3-abstract-full" style="display: none;"> The recently proposed serialized output training (SOT) simplifies multi-talker automatic speech recognition (ASR) by generating speaker transcriptions separated by a special token. However, frequent speaker changes can make speaker change prediction difficult. To address this, we propose boundary-aware serialized output training (BA-SOT), which explicitly incorporates boundary knowledge into the decoder via a speaker change detection task and boundary constraint loss. 
We also introduce a two-stage connectionist temporal classification (CTC) strategy that incorporates token-level SOT CTC to restore temporal context information. Besides typical character error rate (CER), we introduce utterance-dependent character error rate (UD-CER) to further measure the precision of speaker change prediction. Compared to original SOT, BA-SOT reduces CER/UD-CER by 5.1%/14.0%, and leveraging a pre-trained ASR model for BA-SOT model initialization further reduces CER/UD-CER by 8.4%/19.9%.
Submitted 5 October, 2023; v1 submitted 23 May, 2023; originally announced May 2023.
Comments: Accepted by INTERSPEECH 2023

arXiv:2305.13629 [pdf, other] (eess.AS)
doi: 10.21437/Interspeech.2023-746
TranUSR: Phoneme-to-word Transcoder Based Unified Speech Representation Learning for Cross-lingual Speech Recognition
Authors: Hongfei Xue, Qijie Shao, Peikun Chen, Pengcheng Guo, Lei Xie, Jie Liu
Abstract: UniSpeech has achieved superior performance in cross-lingual automatic speech recognition (ASR) by explicitly aligning latent representations to phoneme units using multi-task self-supervised learning. While the learned representations transfer well from high-resource to low-resource languages, predicting words directly from these phonetic representations in downstream ASR is challenging.
In this paper, we propose TranUSR, a two-stage model comprising a pre-trained UniData2vec and a phoneme-to-word Transcoder. Different from UniSpeech, UniData2vec replaces the quantized discrete representations with continuous and contextual representations from a teacher model for phonetically-aware pre-training. Then, Transcoder learns to translate phonemes to words with the aid of extra texts, enabling direct word generation. Experiments on Common Voice show that UniData2vec reduces PER by 5.3% compared to UniSpeech, while Transcoder yields a 14.4% WER reduction compared to grapheme fine-tuning.
Submitted 8 October, 2023; v1 submitted 22 May, 2023; originally announced May 2023.
Comments: 5 pages, 3 figures.
Accepted by INTERSPEECH 2023

arXiv:2305.12493 [pdf, other] (eess.AS, cs.CL, cs.SD)
Contextualized End-to-End Speech Recognition with Contextual Phrase Prediction Network
Authors: Kaixun Huang, Ao Zhang, Zhanheng Yang, Pengcheng Guo, Bingshen Mu, Tianyi Xu, Lei Xie
Abstract: Contextual information plays a crucial role in speech recognition technologies and incorporating it into the end-to-end speech recognition models has drawn immense interest recently. However, previous deep bias methods lacked explicit supervision for bias tasks. In this study, we introduce a contextual phrase prediction network for an attention-based deep bias method. This network predicts context phrases in utterances using contextual embeddings and calculates bias loss to assist in the training of the contextualized model. Our method achieved a significant word error rate (WER) reduction across various end-to-end speech recognition models. Experiments on the LibriSpeech corpus show that our proposed model obtains a 12.1% relative WER improvement over the baseline model, and the WER of the context phrases decreases relatively by 40.5%. Moreover, by applying a context phrase filtering strategy, we also effectively eliminate the WER degradation when using a larger biasing list.
Submitted 12 July, 2023; v1 submitted 21 May, 2023; originally announced May 2023.
Comments: Accepted by INTERSPEECH 2023
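
As an illustration of what a bias loss of this kind could look like, the sketch below (not the paper's network) scores each phrase in a bias list against a pooled utterance embedding and applies a multi-label binary cross-entropy against which phrases actually occur; all tensors and sizes are synthetic.

import torch
import torch.nn as nn

bias_list_size, ctx_dim = 20, 256
utt_embedding = torch.randn(ctx_dim)                        # pooled utterance-level embedding (synthetic)
phrase_embeddings = torch.randn(bias_list_size, ctx_dim)    # embeddings of the phrases in the bias list

logits = phrase_embeddings @ utt_embedding                  # one score per bias phrase
occurs = torch.zeros(bias_list_size)                        # multi-hot label: which phrases occur in the utterance
occurs[[2, 11]] = 1.0

bias_loss = nn.functional.binary_cross_entropy_with_logits(logits, occurs)
print(float(bias_loss))                                     # would be added to the ASR loss during training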
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.12493v5-abstract-full').style.display = 'none'; document.getElementById('2305.12493v5-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 July, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 21 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by interspeech2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2303.06341">arXiv:2303.06341</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2303.06341">pdf</a>, <a href="https://arxiv.org/format/2303.06341">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> The NPU-ASLP System for Audio-Visual Speech Recognition in MISP 2022 Challenge </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Guo%2C+P">Pengcheng Guo</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+H">He Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Mu%2C+B">Bingshen Mu</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+A">Ao Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+P">Peikun Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2303.06341v1-abstract-short" style="display: inline;"> This paper describes our NPU-ASLP system for the Audio-Visual Diarization and Recognition (AVDR) task in the Multi-modal Information based Speech Processing (MISP) 2022 Challenge. Specifically, the weighted prediction error (WPE) and guided source separation (GSS) techniques are used to reduce reverberation and generate clean signals for each single speaker first. Then, we explore the effectivenes&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.06341v1-abstract-full').style.display = 'inline'; document.getElementById('2303.06341v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2303.06341v1-abstract-full" style="display: none;"> This paper describes our NPU-ASLP system for the Audio-Visual Diarization and Recognition (AVDR) task in the Multi-modal Information based Speech Processing (MISP) 2022 Challenge. Specifically, the weighted prediction error (WPE) and guided source separation (GSS) techniques are used to reduce reverberation and generate clean signals for each single speaker first. Then, we explore the effectiveness of Branchformer and E-Branchformer based ASR systems. To better make use of the visual modality, a cross-attention based multi-modal fusion module is proposed, which explicitly learns the contextual relationship between different modalities. 

arXiv:2302.13523 [pdf, other] (cs.SD, eess.AS)
VE-KWS: Visual Modality Enhanced End-to-End Keyword Spotting
Authors: Ao Zhang, He Wang, Pengcheng Guo, Yihui Fu, Lei Xie, Yingying Gao, Shilei Zhang, Junlan Feng
Abstract: The performance of the keyword spotting (KWS) system based on audio modality, commonly measured in false alarms and false rejects, degrades significantly under the far field and noisy conditions. Therefore, audio-visual keyword spotting, which leverages complementary relationships over multiple modalities, has recently gained much attention.
However, current studies mainly focus on combining the exclusively learned representations of different modalities, instead of exploring the modal relationships during each respective modeling. In this paper, we propose a novel visual modality enhanced end-to-end KWS framework (VE-KWS), which fuses audio and visual modalities from two aspects. The first one is utilizing the speaker location information obtained from the lip region in videos to assist the training of multi-channel audio beamformer. By involving the beamformer as an audio enhancement module, the acoustic distortions, caused by the far field or noisy environments, could be significantly suppressed. The other one is conducting cross-attention between different modalities to capture the inter-modal relationships and help the representation learning of each modality. Experiments on the MISP challenge corpus show that our proposed model achieves 2.79% false rejection rate and 2.95% false alarm rate on the Eval set, resulting in a new SOTA performance compared with the top-ranking systems in the ICASSP2022 MISP challenge.
Submitted 14 March, 2023; v1 submitted 27 February, 2023; originally announced February 2023.
Comments: 5 pages. Accepted at ICASSP2023

arXiv:2211.13443 [pdf, other] (cs.SD, eess.AS)
TESSP: Text-Enhanced Self-Supervised Speech Pre-training
Authors: Zhuoyuan Yao, Shuo Ren, Sanyuan Chen, Ziyang Ma, Pengcheng Guo, Lei Xie
Abstract: Self-supervised speech pre-training empowers the model with the contextual structure inherent in the speech signal while self-supervised text pre-training empowers the model with linguistic information. Both of them are beneficial for downstream speech tasks such as ASR.
However, the distinct pre-training objectives make it challenging to jointly optimize the speech and text representation in the same model. To solve this problem, we propose Text-Enhanced Self-Supervised Speech Pre-training (TESSP), aiming to incorporate the linguistic information into speech pre-training. Our model consists of three parts, i.e., a speech encoder, a text encoder and a shared encoder. The model takes unsupervised speech and text data as the input and leverages the common HuBERT and MLM losses respectively. We also propose phoneme up-sampling and representation swapping to enable joint modeling of the speech and text information. Specifically, to fix the length mismatching problem between speech and text data, we phonemize the text sequence and up-sample the phonemes with the alignment information extracted from a small set of supervised data. Moreover, to close the gap between the learned speech and text representations, we swap the text representation with the speech representation extracted by the respective private encoders according to the alignment information. Experiments on the Librispeech dataset show that the proposed TESSP model achieves more than 10% improvement compared with WavLM on the test-clean and test-other sets. We also evaluate our model on the SUPERB benchmark, showing our model has better performance on Phoneme Recognition, Acoustic Speech Recognition and Speech Translation compared with WavLM.
Submitted 24 November, 2022; originally announced November 2022.
Comments: 9 pages, 4 figures
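
To illustrate the phoneme up-sampling step described in this abstract, the following sketch (not the TESSP code) repeats each phoneme embedding for its aligned duration so the text branch reaches frame-level length; phoneme IDs, durations, and embedding size are invented.

import torch
import torch.nn as nn

phonemes = torch.tensor([5, 17, 3, 17])           # phoneme IDs of one utterance (invented)
durations = torch.tensor([7, 12, 4, 9])           # frames per phoneme from a forced alignment (invented)

embed = nn.Embedding(num_embeddings=50, embedding_dim=256)
phone_emb = embed(phonemes)                        # (4, 256) phoneme-level embeddings
frame_level = torch.repeat_interleave(phone_emb, durations, dim=0)

print(frame_level.shape)                           # (32, 256): durations.sum() frames, matching the speech branch length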

arXiv:2211.03038 [pdf, other] (eess.AS, cs.CR, cs.SD)
Distinguishable Speaker Anonymization based on Formant and Fundamental Frequency Scaling
Authors: Jixun Yao, Qing Wang, Yi Lei, Pengcheng Guo, Lei Xie, Namin Wang, Jie Liu
Abstract: Speech data on the Internet are proliferating exponentially because of the emergence of social media, and the sharing of such personal data raises obvious security and privacy concerns. One solution to mitigate these concerns involves concealing speaker identities before sharing speech data, also referred to as speaker anonymization. In our previous work, we have developed an automatic speaker verification (ASV)-model-free anonymization framework to protect speaker privacy while preserving speech intelligibility. Although the framework ranked first place in VoicePrivacy 2022 challenge, the anonymization was imperfect, since the speaker distinguishability of the anonymized speech was deteriorated. To address this issue, in this paper, we directly model the formant distribution and fundamental frequency (F0) to represent speaker identity and anonymize the source speech by uniformly scaling the formant and F0.
By directly scaling the formant and F0, the speaker distinguishability degradation of the anonymized speech caused by the introduction of other speakers is prevented. The experimental results demonstrate that our proposed framework can improve the speaker distinguishability and significantly outperforms our previous framework in voice distinctiveness. Furthermore, our proposed method also can trade off the privacy-utility by using different scaling factors.
Submitted 6 November, 2022; originally announced November 2022.
Comments: Submitted to ICASSP 2023
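
The snippet below gives a rough, self-contained picture of what uniform F0 and formant scaling can look like on synthetic arrays: the F0 contour is multiplied by a constant, and a spectral envelope is resampled along frequency so that spectral peaks (formants) move by another constant factor. It only sketches the general operation and is not the paper's anonymization pipeline; the scaling factors and array shapes are arbitrary.

import numpy as np

np.random.seed(0)
f0 = np.abs(np.random.randn(200)) * 30 + 120       # synthetic F0 contour in Hz (200 frames)
envelope = np.random.rand(200, 513)                 # synthetic spectral envelope (frames x frequency bins)

f0_scale, formant_scale = 1.2, 0.9                  # arbitrary anonymization factors
f0_anon = f0 * f0_scale                             # fundamental frequency scaled by a constant

bins = np.arange(envelope.shape[1])
# The value at bin f is taken from the original envelope at f / formant_scale,
# so spectral peaks end up at formant_scale times their original frequency.
envelope_anon = np.stack([np.interp(bins / formant_scale, bins, frame) for frame in envelope])

print(round(float(f0_anon.mean() / f0.mean()), 2), envelope_anon.shape)   # 1.2 (200, 513)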
arXiv:2211.03036 [pdf, other] (https://arxiv.org/abs/2211.03036)
Subjects: eess.AS (Audio and Speech Processing); cs.SD (Sound)
Title: Preserving background sound in noise-robust voice conversion via multi-task learning
Authors: Jixun Yao, Yi Lei, Qing Wang, Pengcheng Guo, Ziqian Ning, Lei Xie, Hai Li, Junhui Liu, Danming Xie
Abstract: Background sound is an informative form of art that helps provide a more immersive experience in real-application voice conversion (VC) scenarios. However, prior research on VC, mainly focusing on clean voices, pays little attention to VC with background sound. The critical problems for preserving background sound in VC are the inevitable speech distortion introduced by the neural separation model and the cascade mismatch between the source separation model and the VC model. In this paper, we propose an end-to-end framework via multi-task learning which sequentially cascades a source separation (SS) module, a bottleneck feature extraction module and a VC module. Specifically, the source separation task explicitly considers critical phase information and confines the distortion caused by the imperfect separation process. The source separation task, the typical VC task and the unified task share a uniform reconstruction loss constrained by joint training to reduce the mismatch between the SS and VC modules. Experimental results demonstrate that our proposed framework significantly outperforms the baseline systems while achieving quality and speaker similarity comparable to VC models trained with clean data.
Submitted 6 November, 2022; originally announced November 2022.
Comments: Submitted to ICASSP 2023

arXiv:2210.05265 [pdf, other] (https://arxiv.org/abs/2210.05265)
Subjects: cs.SD (Sound); eess.AS (Audio and Speech Processing)
Title: MFCCA: Multi-Frame Cross-Channel Attention for Multi-Speaker ASR in Multi-Party Meeting Scenario
Authors: Fan Yu, Shiliang Zhang, Pengcheng Guo, Yuhao Liang, Zhihao Du, Yuxiao Lin, Lei Xie
Abstract: Recently, cross-channel attention, which better leverages multi-channel signals from a microphone array, has shown promising results in the multi-party meeting scenario. Cross-channel attention focuses on either learning global correlations between sequences of different channels or exploiting fine-grained channel-wise information effectively at each time step. Considering the delay with which a microphone array receives sound, we propose a multi-frame cross-channel attention, which models cross-channel information between adjacent frames to exploit the complementarity of both frame-wise and channel-wise knowledge. Besides, we also propose a multi-layer convolutional mechanism to fuse the multi-channel output and a channel masking strategy to combat the channel-number mismatch between training and inference. Experiments on AliMeeting, a real-world corpus, reveal that our proposed model outperforms the single-channel model with 31.7% and 37.0% CER reductions on the Eval and Test sets. Moreover, with comparable model parameters and training data, our proposed model achieves a new SOTA performance on the AliMeeting corpus compared with the top-ranking systems in the ICASSP 2022 M2MeT challenge, a recently held multi-channel multi-speaker ASR challenge.
Submitted 11 October, 2022; originally announced October 2022.
Comments: Accepted by SLT 2022
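As a rough illustration of the multi-frame cross-channel attention idea (each channel at each frame attends over the channel vectors of neighbouring frames), here is a minimal PyTorch sketch; the tensor shapes, context size, and single-head formulation are assumptions for illustration and do not reproduce the MFCCA model.

import torch
import torch.nn.functional as F

def multi_frame_cross_channel_attention(x, context=2):
    # x: (batch, time, channels, dim). Each (frame, channel) query attends over
    # the channel vectors of the 2*context + 1 neighbouring frames.
    b, t, c, d = x.shape
    padded = F.pad(x, (0, 0, 0, 0, context, context))            # pad the time axis
    keys = torch.stack([padded[:, i:i + t] for i in range(2 * context + 1)], dim=2)
    keys = keys.reshape(b, t, -1, d)                             # (b, t, (2*context+1)*c, d)
    scores = torch.einsum('btcd,btkd->btck', x, keys) / d ** 0.5
    attn = torch.softmax(scores, dim=-1)
    return torch.einsum('btck,btkd->btcd', attn, keys)           # same shape as x

# Hypothetical shapes: batch 2, 50 frames, a 4-channel array, 64-dim features.
out = multi_frame_cross_channel_attention(torch.randn(2, 50, 4, 64))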
arXiv:2209.11969 [pdf, other] (https://arxiv.org/abs/2209.11969)
Subjects: eess.AS (Audio and Speech Processing); cs.SD (Sound)
Title: NWPU-ASLP System for the VoicePrivacy 2022 Challenge
Authors: Jixun Yao, Qing Wang, Li Zhang, Pengcheng Guo, Yuhao Liang, Lei Xie
Abstract: This paper presents the NWPU-ASLP speaker anonymization system for the VoicePrivacy 2022 Challenge. Our submission does not involve an additional Automatic Speaker Verification (ASV) model or an x-vector pool. Our system consists of four modules: a feature extractor, an acoustic model, an anonymization module, and a neural vocoder. First, the feature extractor extracts the Phonetic Posteriorgram (PPG) and pitch from the input speech signal. Then, we reserve a pseudo speaker ID from a speaker look-up table (LUT), which is subsequently fed into a speaker encoder to generate a pseudo speaker embedding that does not correspond to any real speaker. To ensure the pseudo speaker is distinguishable, we further average randomly selected speaker embeddings and concatenate the result, with a weight, with the pseudo speaker embedding to generate the anonymized speaker embedding. Finally, the acoustic model outputs the anonymized mel-spectrogram from the anonymized speaker embedding, and a modified version of HifiGAN transforms the mel-spectrogram into the anonymized speech waveform. Experimental results demonstrate the effectiveness of our proposed anonymization system.
Submitted 24 September, 2022; originally announced September 2022.
Comments: VoicePrivacy 2022 Challenge
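The anonymized-speaker-embedding step in the abstract above (averaging randomly selected real-speaker embeddings and combining them with a pseudo speaker embedding) can be sketched in a few lines. The snippet below is a hedged illustration, not the NWPU-ASLP code; the pool size, selection count, and weighting are hypothetical.

import numpy as np

def anonymized_embedding(pseudo_emb, speaker_pool, n_select=10, weight=0.5, seed=0):
    # Average a few randomly selected real-speaker embeddings and concatenate
    # the (weighted) average with the pseudo-speaker embedding.
    rng = np.random.default_rng(seed)
    idx = rng.choice(len(speaker_pool), size=n_select, replace=False)
    averaged = speaker_pool[idx].mean(axis=0)
    return np.concatenate([pseudo_emb, weight * averaged])

# Hypothetical sizes: 256-dim embeddings, a pool of 200 real speakers.
pool = np.random.randn(200, 256)
anon = anonymized_embedding(np.random.randn(256), pool)   # 512-dim anonymized embedding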
arXiv:2207.00883 [pdf, other] (https://arxiv.org/abs/2207.00883)
Subjects: cs.SD (Sound); cs.CL (Computation and Language); eess.AS (Audio and Speech Processing)
Title: Improving Transformer-based Conversational ASR by Inter-Sentential Attention Mechanism
Authors: Kun Wei, Pengcheng Guo, Ning Jiang
Abstract: Transformer-based models have demonstrated their effectiveness in automatic speech recognition (ASR) tasks and have even shown superior performance over the conventional hybrid framework. The main idea of Transformers is to capture the long-range global context within an utterance by self-attention layers. However, for scenarios like conversational speech, such utterance-level modeling neglects contextual dependencies that span across utterances. In this paper, we propose to explicitly model the inter-sentential information in a Transformer-based end-to-end architecture for conversational speech recognition. Specifically, for the encoder network, we capture the contexts of previous speech and incorporate such historic information into the current input by a context-aware residual attention mechanism. For the decoder, the prediction of the current utterance is also conditioned on the historic linguistic information through a conditional decoder framework. We show the effectiveness of our proposed method on several open-source dialogue corpora, and the proposed method consistently improves performance over utterance-level Transformer-based ASR models.
Submitted 2 July, 2022; originally announced July 2022.
Comments: Accepted by Interspeech 2022

arXiv:2206.06065 [pdf] (https://arxiv.org/abs/2206.06065)
Subjects: eess.IV (Image and Video Processing); cs.CV (Computer Vision and Pattern Recognition)
Title: Deep ensemble learning for segmenting tuberculosis-consistent manifestations in chest radiographs
Authors: Sivaramakrishnan Rajaraman, Feng Yang, Ghada Zamzmi, Peng Guo, Zhiyun Xue, Sameer K Antani
Abstract: Automated segmentation of tuberculosis (TB)-consistent lesions in chest X-rays (CXRs) using deep learning (DL) methods can help reduce radiologist effort, supplement clinical decision-making, and potentially result in improved patient treatment. The majority of works in the literature discuss training automatic segmentation models using coarse bounding box annotations. However, the granularity of the bounding box annotation could result in the inclusion of a considerable fraction of false positives and negatives at the pixel level that may adversely impact overall semantic segmentation performance. This study (i) evaluates the benefits of using fine-grained annotations of TB-consistent lesions and (ii) trains and constructs ensembles of U-Net model variants for semantically segmenting TB-consistent lesions in both original and bone-suppressed frontal CXRs. We evaluated segmentation performance using several ensemble methods such as bitwise-AND, bitwise-OR, bitwise-MAX, and stacking. We observed that the stacking ensemble demonstrated superior segmentation performance (Dice score: 0.5743, 95% confidence interval: (0.4055, 0.7431)) compared to the individual constituent models and the other ensemble methods. To the best of our knowledge, this is the first study to apply ensemble learning to improve fine-grained TB-consistent lesion segmentation performance.
Submitted 13 June, 2022; originally announced June 2022.
Comments: 13 pages, 6 figures
MSC Class: 68T07
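The ensemble rules named in the abstract above (bitwise-AND, bitwise-OR, bitwise-MAX) are simple to state; only the stacking ensemble needs a learned combiner. A minimal sketch follows, with hypothetical shapes and threshold; it is not the study's code.

import numpy as np

def ensemble_masks(prob_maps, method="or", threshold=0.5):
    # prob_maps: (n_models, H, W) per-model lesion probabilities.
    binary = prob_maps >= threshold
    if method == "and":                      # lesion only where every model agrees
        return np.logical_and.reduce(binary)
    if method == "or":                       # lesion where any model fires
        return np.logical_or.reduce(binary)
    if method == "max":                      # threshold the most confident prediction
        return prob_maps.max(axis=0) >= threshold
    raise ValueError(method)

# A stacking ensemble would instead feed the stacked probability maps to a small
# learned meta-model (e.g. a 1x1 convolution or logistic regression), not shown here.
preds = np.random.rand(3, 256, 256)          # hypothetical outputs of 3 U-Net variants
fused = ensemble_masks(preds, method="max")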
arXiv:2204.11669 [pdf] (https://arxiv.org/abs/2204.11669)
Subjects: eess.IV (Image and Video Processing); cs.AI (Artificial Intelligence); physics.med-ph (Medical Physics)
DOI: 10.1038/s41746-023-00859-y
Title: Deep-learning-enabled Brain Hemodynamic Mapping Using Resting-state fMRI
Authors: Xirui Hou, Pengfei Guo, Puyang Wang, Peiying Liu, Doris D. M. Lin, Hongli Fan, Yang Li, Zhiliang Wei, Zixuan Lin, Dengrong Jiang, Jin Jin, Catherine Kelly, Jay J. Pillai, Judy Huang, Marco C. Pinho, Binu P. Thomas, Babu G. Welch, Denise C. Park, Vishal M. Patel, Argye E. Hillis, Hanzhang Lu
Abstract: Cerebrovascular disease is a leading cause of death globally. Prevention and early intervention are known to be the most effective forms of its management. Non-invasive imaging methods hold great promise for early stratification, but at present lack the sensitivity for personalized prognosis. Resting-state functional magnetic resonance imaging (rs-fMRI), a powerful tool previously used for mapping neural activity, is available in most hospitals. Here we show that rs-fMRI can be used to map cerebral hemodynamic function and delineate impairment. By exploiting time variations in breathing pattern during rs-fMRI, deep learning enables reproducible mapping of cerebrovascular reactivity (CVR) and bolus arrival time (BAT) of the human brain using resting-state CO2 fluctuations as a natural 'contrast medium'. The deep-learning network was trained with CVR and BAT maps obtained with a reference CO2-inhalation MRI method, which included data from young and older healthy subjects and patients with Moyamoya disease and brain tumors. We demonstrate the performance of deep-learning cerebrovascular mapping in the detection of vascular abnormalities, the evaluation of revascularization effects, and vascular alterations in normal aging. In addition, cerebrovascular maps obtained with the proposed method exhibited excellent reproducibility in both healthy volunteers and stroke patients. Deep-learning resting-state vascular imaging has the potential to become a useful tool in clinical cerebrovascular imaging.
Submitted 25 April, 2022; originally announced April 2022.
Journal ref: npj Digital Medicine (2023) 116

arXiv:2204.03398 [pdf, other] (https://arxiv.org/abs/2204.03398)
Subjects: cs.SD (Sound); eess.AS (Audio and Speech Processing)
Title: Linguistic-Acoustic Similarity Based Accent Shift for Accent Recognition
Authors: Qijie Shao, Jinghao Yan, Jian Kang, Pengcheng Guo, Xian Shi, Pengfei Hu, Lei Xie
Abstract: General accent recognition (AR) models tend to directly extract low-level information from spectra, which often makes them overfit to speakers or channels. Considering that an accent can be regarded as a series of shifts relative to native pronunciation, distinguishing accents becomes an easier task when the accent shift is used as input. But due to the lack of a native utterance as an anchor, estimating the accent shift is difficult. In this paper, we propose linguistic-acoustic similarity based accent shift (LASAS) for AR tasks. For an accented speech utterance, after mapping the corresponding text vector to multiple accent-associated spaces as anchors, its accent shift can be estimated by the similarities between the acoustic embedding and those anchors. Then, we concatenate the accent shift with a dimension-reduced text vector to obtain a linguistic-acoustic bimodal representation. Compared with a pure acoustic embedding, the bimodal representation is richer and clearer because it takes full advantage of both linguistic and acoustic information, which can effectively improve AR performance. Experiments on the Accented English Speech Recognition Challenge (AESRC) dataset show that our method achieves 77.42% accuracy on the Test set, a 6.94% relative improvement over a competitive system in the challenge.
Submitted 1 July, 2022; v1 submitted 7 April, 2022; originally announced April 2022.
Comments: Accepted by Interspeech 2022
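A compact way to read the LASAS construction in the abstract above: project the text vector into several accent-associated spaces, score each projection against the acoustic embedding, and append a dimension-reduced text vector. The sketch below uses cosine similarity and random projection matrices purely for illustration; the actual similarity measure and projections in the paper may differ.

import numpy as np

def accent_shift_features(text_vec, acoustic_emb, accent_mats, reduce_mat):
    # Map the text vector into each accent-associated space (one anchor per accent),
    # measure similarity between every anchor and the acoustic embedding, then
    # concatenate the similarities with a dimension-reduced text vector.
    def cos(a, b):
        return float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-8))
    anchors = [W @ text_vec for W in accent_mats]
    shift = np.array([cos(anchor, acoustic_emb) for anchor in anchors])
    return np.concatenate([shift, reduce_mat @ text_vec])      # bimodal representation

# Hypothetical dimensions: 512-dim text vector, 256-dim acoustic embedding, 8 accent spaces.
Ws = [np.random.randn(256, 512) for _ in range(8)]
R = np.random.randn(32, 512)
feat = accent_shift_features(np.random.randn(512), np.random.randn(256), Ws, R)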
arXiv:2203.06338 [pdf, other] (https://arxiv.org/abs/2203.06338)
Subjects: eess.IV (Image and Video Processing); cs.CV (Computer Vision and Pattern Recognition)
Title: Auto-FedRL: Federated Hyperparameter Optimization for Multi-institutional Medical Image Segmentation
Authors: Pengfei Guo, Dong Yang, Ali Hatamizadeh, An Xu, Ziyue Xu, Wenqi Li, Can Zhao, Daguang Xu, Stephanie Harmon, Evrim Turkbey, Baris Turkbey, Bradford Wood, Francesca Patella, Elvira Stellato, Gianpaolo Carrafiello, Vishal M. Patel, Holger R. Roth
Abstract: Federated learning (FL) is a distributed machine learning technique that enables collaborative model training while avoiding explicit data sharing. The inherent privacy-preserving property of FL algorithms makes them especially attractive to the medical field. However, in the case of heterogeneous client data distributions, standard FL methods are unstable and require intensive hyperparameter tuning to achieve optimal performance. Conventional hyperparameter optimization algorithms are impractical in real-world FL applications as they involve numerous training trials, which are often not affordable with limited compute budgets. In this work, we propose an efficient reinforcement learning (RL)-based federated hyperparameter optimization algorithm, termed Auto-FedRL, in which an online RL agent can dynamically adjust the hyperparameters of each client based on the current training progress. Extensive experiments are conducted to investigate different search strategies and RL agents. The effectiveness of the proposed method is validated on a heterogeneous data split of the CIFAR-10 dataset as well as two real-world medical image segmentation datasets for COVID-19 lesion segmentation in chest CT and pancreas segmentation in abdominal CT.
Submitted 31 August, 2022; v1 submitted 11 March, 2022; originally announced March 2022.

arXiv:2203.05574 [pdf, other] (https://arxiv.org/abs/2203.05574)
Subjects: eess.IV (Image and Video Processing); cs.CV (Computer Vision and Pattern Recognition)
Title: On-the-Fly Test-time Adaptation for Medical Image Segmentation
Authors: Jeya Maria Jose Valanarasu, Pengfei Guo, Vibashan VS, Vishal M. Patel
Abstract: One major problem in deep learning-based solutions for medical imaging is the drop in performance when a model is tested on a data distribution different from the one it was trained on. Adapting the source model to the target data distribution at test time is an efficient solution to the data-shift problem. Previous methods solve this by adapting the model to the target distribution using techniques like entropy minimization or regularization. In these methods, the models are still updated by back-propagation using an unsupervised loss on the complete test data distribution. In real-world clinical settings, it makes more sense to adapt a model to a new test image on the fly and avoid model updates during inference, due to privacy concerns and the lack of computing resources at deployment. To this end, we propose a new setting, On-the-Fly Adaptation, which is zero-shot and episodic (i.e., the model is adapted to a single image at a time and does not perform any back-propagation during test time). To achieve this, we propose a new framework called Adaptive UNet, where each convolutional block is equipped with an adaptive batch normalization layer to adapt the features with respect to a domain code. The domain code is generated using a pre-trained encoder trained on a large corpus of medical images. During test time, the model takes in just the new test image and generates a domain code to adapt the features of the source model according to the test data. We validate the performance on both 2D and 3D data distribution shifts and obtain better performance than previous test-time adaptation methods. Code is available at https://github.com/jeya-maria-jose/On-The-Fly-Adaptation
Submitted 10 March, 2022; originally announced March 2022.
Comments: Tech Report
arXiv:2203.04292 [pdf, other] (https://arxiv.org/abs/2203.04292)
Subjects: eess.IV (Image and Video Processing); cs.CV (Computer Vision and Pattern Recognition); cs.LG (Machine Learning)
Title: Towards performant and reliable undersampled MR reconstruction via diffusion model sampling
Authors: Cheng Peng, Pengfei Guo, S. Kevin Zhou, Vishal Patel, Rama Chellappa
Abstract: Magnetic Resonance (MR) image reconstruction from under-sampled acquisition promises faster scanning time. To this end, current state-of-the-art (SoTA) approaches leverage deep neural networks and supervised training to learn a recovery model. While these approaches achieve impressive performance, the learned model can be fragile on unseen degradation, e.g., when given a different acceleration factor. These methods are also generally deterministic and provide a single solution to an ill-posed problem; as such, it can be difficult for practitioners to understand the reliability of the reconstruction. We introduce DiffuseRecon, a novel diffusion-model-based MR reconstruction method. DiffuseRecon guides the generation process based on the observed signals and a pre-trained diffusion model, and does not require additional training on specific acceleration factors. DiffuseRecon is stochastic in nature and generates results from a distribution of fully sampled MR images; as such, it allows us to explicitly visualize different potential reconstruction solutions. Lastly, DiffuseRecon proposes an accelerated, coarse-to-fine Monte-Carlo sampling scheme to approximate the most likely reconstruction candidate. The proposed DiffuseRecon achieves SoTA performance reconstructing from raw acquisition signals in fastMRI and SKM-TEA. Code will be open-sourced at www.github.com/cpeng93/DiffuseRecon.
Submitted 10 March, 2022; v1 submitted 7 March, 2022; originally announced March 2022.
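DiffuseRecon's full guidance and coarse-to-fine Monte-Carlo scheme are beyond a few lines, but the core idea of conditioning the generation on the observed signals can be illustrated with a standard k-space data-consistency step. The snippet below is a generic sketch with hypothetical names and sampling mask, not the DiffuseRecon algorithm.

import numpy as np

def kspace_data_consistency(image, k_observed, mask):
    # Replace the reconstruction's k-space values at sampled locations with the
    # actual measurements, then return to the image domain.
    k = np.fft.fft2(image)
    k[mask] = k_observed[mask]
    return np.fft.ifft2(k).real

# Hypothetical 4x-undersampled 256x256 acquisition.
mask = np.random.rand(256, 256) < 0.25
k_obs = np.fft.fft2(np.random.rand(256, 256)) * mask
estimate = np.random.rand(256, 256)           # e.g. a denoised sample from the diffusion model
consistent = kspace_data_consistency(estimate, k_obs, mask)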
arXiv:2202.03647 [pdf, other] (https://arxiv.org/abs/2202.03647)
Subjects: cs.SD (Sound); eess.AS (Audio and Speech Processing)
Title: Summary On The ICASSP 2022 Multi-Channel Multi-Party Meeting Transcription Grand Challenge
Authors: Fan Yu, Shiliang Zhang, Pengcheng Guo, Yihui Fu, Zhihao Du, Siqi Zheng, Weilong Huang, Lei Xie, Zheng-Hua Tan, DeLiang Wang, Yanmin Qian, Kong Aik Lee, Zhijie Yan, Bin Ma, Xin Xu, Hui Bu
Abstract: The ICASSP 2022 Multi-channel Multi-party Meeting Transcription Grand Challenge (M2MeT) focuses on one of the most valuable and most challenging scenarios of speech technologies. The M2MeT challenge set up two tracks: speaker diarization (track 1) and multi-speaker automatic speech recognition (ASR) (track 2). Along with the challenge, we released 120 hours of real-recorded Mandarin meeting speech data with manual annotation, including far-field data collected by an 8-channel microphone array as well as near-field data collected by each participant's headset microphone. We briefly describe the released dataset, track setups, and baselines, and summarize the challenge results and major techniques used in the submissions.
Submitted 25 February, 2022; v1 submitted 8 February, 2022; originally announced February 2022.
Comments: Accepted by ICASSP 2022

arXiv:2201.09376 [pdf, other] (https://arxiv.org/abs/2201.09376)
Subjects: eess.IV (Image and Video Processing); cs.CV (Computer Vision and Pattern Recognition)
Title: ReconFormer: Accelerated MRI Reconstruction Using Recurrent Transformer
Authors: Pengfei Guo, Yiqun Mei, Jinyuan Zhou, Shanshan Jiang, Vishal M. Patel
Abstract: Accelerating the magnetic resonance imaging (MRI) reconstruction process is a challenging ill-posed inverse problem due to the excessive under-sampling operation in k-space. In this paper, we propose a recurrent transformer model, namely ReconFormer, for MRI reconstruction, which can iteratively reconstruct high-fidelity magnetic resonance images from highly under-sampled k-space data. In particular, the proposed architecture is built upon Recurrent Pyramid Transformer Layers (RPTL), which jointly exploit intrinsic multi-scale information at every architecture unit as well as the dependencies of deep feature correlations through recurrent states. Moreover, the proposed ReconFormer is lightweight, since it employs the recurrent structure for parameter efficiency. We validate the effectiveness of ReconFormer on multiple datasets with different magnetic resonance sequences and show that it achieves significant improvements over state-of-the-art methods with better parameter efficiency. Implementation code will be available at https://github.com/guopengf/ReconFormer.
Submitted 27 January, 2022; v1 submitted 23 January, 2022; originally announced January 2022.

arXiv:2111.06707 [pdf, other] (https://arxiv.org/abs/2111.06707)
Subjects: eess.IV (Image and Video Processing); cs.CV (Computer Vision and Pattern Recognition)
Title: Transformer-based Image Compression
Authors: Ming Lu, Peiyao Guo, Huiqing Shi, Chuntong Cao, Zhan Ma
Abstract: A Transformer-based Image Compression (TIC) approach is developed which reuses the canonical variational autoencoder (VAE) architecture with paired main and hyper encoder-decoders. Both the main and hyper encoders are comprised of a sequence of neural transformation units (NTUs) that analyse and aggregate important information for a more compact representation of the input image, while the decoders mirror the encoder-side operations to generate the pixel-domain image reconstruction from the compressed bitstream. Each NTU consists of a Swin Transformer Block (STB) and a convolutional layer (Conv) to best embed both long-range and short-range information; in the meantime, a causal attention module (CAM) is devised for adaptive context modeling of the latent features to utilize both hyper and autoregressive priors. The TIC rivals state-of-the-art approaches, including deep convolutional neural network (CNN)-based learnt image coding (LIC) methods and the handcrafted, rules-based intra profile of the recently approved Versatile Video Coding (VVC) standard, and requires far fewer model parameters, e.g., up to a 45% reduction relative to the leading-performance LIC.
Submitted 12 November, 2021; originally announced November 2021.
target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>
