CINXE.COM
Search | arXiv e-print repository
<!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"/> <meta name="viewport" content="width=device-width, initial-scale=1"/> <!-- new favicon config and versions by realfavicongenerator.net --> <link rel="apple-touch-icon" sizes="180x180" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-16x16.png"> <link rel="manifest" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/site.webmanifest"> <link rel="mask-icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/safari-pinned-tab.svg" color="#b31b1b"> <link rel="shortcut icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon.ico"> <meta name="msapplication-TileColor" content="#b31b1b"> <meta name="msapplication-config" content="images/icons/browserconfig.xml"> <meta name="theme-color" content="#b31b1b"> <!-- end favicon config --> <title>Search | arXiv e-print repository</title> <script defer src="https://static.arxiv.org/static/base/1.0.0a5/fontawesome-free-5.11.2-web/js/all.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/base/1.0.0a5/css/arxivstyle.css" /> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ messageStyle: "none", extensions: ["tex2jax.js"], jax: ["input/TeX", "output/HTML-CSS"], tex2jax: { inlineMath: [ ['$','$'], ["\\(","\\)"] ], displayMath: [ ['$$','$$'], ["\\[","\\]"] ], processEscapes: true, ignoreClass: '.*', processClass: 'mathjax.*' }, TeX: { extensions: ["AMSmath.js", "AMSsymbols.js", "noErrors.js"], noErrors: { inlineDelimiters: ["$","$"], multiLine: false, style: { "font-size": "normal", "border": "" } } }, "HTML-CSS": { availableFonts: ["TeX"] } }); </script> <script src='//static.arxiv.org/MathJax-2.7.3/MathJax.js'></script> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/notification.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/bulma-tooltip.min.css" /> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/search.css" /> <script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g=" crossorigin="anonymous"></script> <script src="https://static.arxiv.org/static/search/0.5.6/js/fieldset.js"></script> <style> radio#cf-customfield_11400 { display: none; } </style> </head> <body> <header><a href="#main-container" class="is-sr-only">Skip to main content</a> <!-- contains Cornell logo and sponsor statement --> <div class="attribution level is-marginless" role="banner"> <div class="level-left"> <a class="level-item" href="https://cornell.edu/"><img src="https://static.arxiv.org/static/base/1.0.0a5/images/cornell-reduced-white-SMALL.svg" alt="Cornell University" width="200" aria-label="logo" /></a> </div> <div class="level-right is-marginless"><p class="sponsors level-item is-marginless"><span id="support-ack-url">We gratefully acknowledge support from<br /> the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors. <a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 321 results for author: <span class="mathjax">Watanabe, S</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> <div class="content"> <form method="GET" action="/search/eess" aria-role="search"> Searching in archive <strong>eess</strong>. <a href="/search/?searchtype=author&query=Watanabe%2C+S">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." type="text" value="Watanabe, S"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Watanabe%2C+S&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Watanabe, S"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. </div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Watanabe%2C+S&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Watanabe%2C+S&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Watanabe%2C+S&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Watanabe%2C+S&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Watanabe%2C+S&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Watanabe%2C+S&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li> <a href="/search/?searchtype=author&query=Watanabe%2C+S&start=250" class="pagination-link " aria-label="Page 6" aria-current="page">6 </a> </li> <li> <a href="/search/?searchtype=author&query=Watanabe%2C+S&start=300" class="pagination-link " aria-label="Page 7" aria-current="page">7 </a> </li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.05361">arXiv:2411.05361</a> <span> [<a href="https://arxiv.org/pdf/2411.05361">pdf</a>, <a href="https://arxiv.org/format/2411.05361">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Dynamic-SUPERB Phase-2: A Collaboratively Expanding Benchmark for Measuring the Capabilities of Spoken Language Models with 180 Tasks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Huang%2C+C">Chien-yu Huang</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+W">Wei-Chih Chen</a>, <a href="/search/eess?searchtype=author&query=Yang%2C+S">Shu-wen Yang</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+A+T">Andy T. Liu</a>, <a href="/search/eess?searchtype=author&query=Li%2C+C">Chen-An Li</a>, <a href="/search/eess?searchtype=author&query=Lin%2C+Y">Yu-Xiang Lin</a>, <a href="/search/eess?searchtype=author&query=Tseng%2C+W">Wei-Cheng Tseng</a>, <a href="/search/eess?searchtype=author&query=Diwan%2C+A">Anuj Diwan</a>, <a href="/search/eess?searchtype=author&query=Shih%2C+Y">Yi-Jen Shih</a>, <a href="/search/eess?searchtype=author&query=Shi%2C+J">Jiatong Shi</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+W">William Chen</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+X">Xuanjun Chen</a>, <a href="/search/eess?searchtype=author&query=Hsiao%2C+C">Chi-Yuan Hsiao</a>, <a href="/search/eess?searchtype=author&query=Peng%2C+P">Puyuan Peng</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+S">Shih-Heng Wang</a>, <a href="/search/eess?searchtype=author&query=Kuan%2C+C">Chun-Yi Kuan</a>, <a href="/search/eess?searchtype=author&query=Lu%2C+K">Ke-Han Lu</a>, <a href="/search/eess?searchtype=author&query=Chang%2C+K">Kai-Wei Chang</a>, <a href="/search/eess?searchtype=author&query=Yang%2C+C">Chih-Kai Yang</a>, <a href="/search/eess?searchtype=author&query=Ritter-Gutierrez%2C+F">Fabian Ritter-Gutierrez</a>, <a href="/search/eess?searchtype=author&query=Chuang%2C+M+T">Ming To Chuang</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+K">Kuan-Po Huang</a>, <a href="/search/eess?searchtype=author&query=Arora%2C+S">Siddhant Arora</a>, <a href="/search/eess?searchtype=author&query=Lin%2C+Y">You-Kuan Lin</a>, <a href="/search/eess?searchtype=author&query=Yeo%2C+E">Eunjung Yeo</a> , et al. (53 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.05361v1-abstract-short" style="display: inline;"> Multimodal foundation models, such as Gemini and ChatGPT, have revolutionized human-machine interactions by seamlessly integrating various forms of data. Developing a universal spoken language model that comprehends a wide range of natural language instructions is critical for bridging communication gaps and facilitating more intuitive interactions. However, the absence of a comprehensive evaluati… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05361v1-abstract-full').style.display = 'inline'; document.getElementById('2411.05361v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.05361v1-abstract-full" style="display: none;"> Multimodal foundation models, such as Gemini and ChatGPT, have revolutionized human-machine interactions by seamlessly integrating various forms of data. Developing a universal spoken language model that comprehends a wide range of natural language instructions is critical for bridging communication gaps and facilitating more intuitive interactions. However, the absence of a comprehensive evaluation benchmark poses a significant challenge. We present Dynamic-SUPERB Phase-2, an open and evolving benchmark for the comprehensive evaluation of instruction-based universal speech models. Building upon the first generation, this second version incorporates 125 new tasks contributed collaboratively by the global research community, expanding the benchmark to a total of 180 tasks, making it the largest benchmark for speech and audio evaluation. While the first generation of Dynamic-SUPERB was limited to classification tasks, Dynamic-SUPERB Phase-2 broadens its evaluation capabilities by introducing a wide array of novel and diverse tasks, including regression and sequence generation, across speech, music, and environmental audio. Evaluation results indicate that none of the models performed well universally. SALMONN-13B excelled in English ASR, while WavLLM demonstrated high accuracy in emotion recognition, but current models still require further innovations to handle a broader range of tasks. We will soon open-source all task data and the evaluation pipeline. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05361v1-abstract-full').style.display = 'none'; document.getElementById('2411.05361v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.17485">arXiv:2410.17485</a> <span> [<a href="https://arxiv.org/pdf/2410.17485">pdf</a>, <a href="https://arxiv.org/format/2410.17485">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> VoiceTextBlender: Augmenting Large Language Models with Speech Capabilities via Single-Stage Joint Speech-Text Supervised Fine-Tuning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Peng%2C+Y">Yifan Peng</a>, <a href="/search/eess?searchtype=author&query=Puvvada%2C+K+C">Krishna C. Puvvada</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+Z">Zhehuai Chen</a>, <a href="/search/eess?searchtype=author&query=Zelasko%2C+P">Piotr Zelasko</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+H">He Huang</a>, <a href="/search/eess?searchtype=author&query=Dhawan%2C+K">Kunal Dhawan</a>, <a href="/search/eess?searchtype=author&query=Hu%2C+K">Ke Hu</a>, <a href="/search/eess?searchtype=author&query=Watanabe%2C+S">Shinji Watanabe</a>, <a href="/search/eess?searchtype=author&query=Balam%2C+J">Jagadeesh Balam</a>, <a href="/search/eess?searchtype=author&query=Ginsburg%2C+B">Boris Ginsburg</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.17485v1-abstract-short" style="display: inline;"> Recent studies have augmented large language models (LLMs) with speech capabilities, leading to the development of speech language models (SpeechLMs). Earlier SpeechLMs focused on single-turn speech-based question answering (QA), where user input comprised a speech context and a text question. More recent studies have extended this to multi-turn conversations, though they often require complex, mu… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.17485v1-abstract-full').style.display = 'inline'; document.getElementById('2410.17485v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.17485v1-abstract-full" style="display: none;"> Recent studies have augmented large language models (LLMs) with speech capabilities, leading to the development of speech language models (SpeechLMs). Earlier SpeechLMs focused on single-turn speech-based question answering (QA), where user input comprised a speech context and a text question. More recent studies have extended this to multi-turn conversations, though they often require complex, multi-stage supervised fine-tuning (SFT) with diverse data. Another critical challenge with SpeechLMs is catastrophic forgetting-where models optimized for speech tasks suffer significant degradation in text-only performance. To mitigate these issues, we propose a novel single-stage joint speech-text SFT approach on the low-rank adaptation (LoRA) of the LLM backbone. Our joint SFT combines text-only SFT data with three types of speech-related data: speech recognition and translation, speech-based QA, and mixed-modal SFT. Compared to previous SpeechLMs with 7B or 13B parameters, our 3B model demonstrates superior performance across various speech benchmarks while preserving the original capabilities on text-only tasks. Furthermore, our model shows emergent abilities of effectively handling previously unseen prompts and tasks, including multi-turn, mixed-modal inputs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.17485v1-abstract-full').style.display = 'none'; document.getElementById('2410.17485v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.03007">arXiv:2410.03007</a> <span> [<a href="https://arxiv.org/pdf/2410.03007">pdf</a>, <a href="https://arxiv.org/format/2410.03007">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> FastAdaSP: Multitask-Adapted Efficient Inference for Large Speech Language Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Lu%2C+Y">Yichen Lu</a>, <a href="/search/eess?searchtype=author&query=Song%2C+J">Jiaqi Song</a>, <a href="/search/eess?searchtype=author&query=Yang%2C+C+H">Chao-Han Huck Yang</a>, <a href="/search/eess?searchtype=author&query=Watanabe%2C+S">Shinji Watanabe</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.03007v1-abstract-short" style="display: inline;"> In this study, we aim to explore Multitask Speech Language Model (SpeechLM) efficient inference via token reduction. Unlike other modalities such as vision or text, speech has unique temporal dependencies, making previous efficient inference works on other modalities not directly applicable. Furthermore, methods for efficient SpeechLM inference on long sequence and sparse signals remain largely un… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.03007v1-abstract-full').style.display = 'inline'; document.getElementById('2410.03007v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.03007v1-abstract-full" style="display: none;"> In this study, we aim to explore Multitask Speech Language Model (SpeechLM) efficient inference via token reduction. Unlike other modalities such as vision or text, speech has unique temporal dependencies, making previous efficient inference works on other modalities not directly applicable. Furthermore, methods for efficient SpeechLM inference on long sequence and sparse signals remain largely unexplored. Then we propose FastAdaSP, a weighted token merging framework specifically designed for various speech-related tasks to improve the trade-off between efficiency and performance. Experimental results on WavLLM and Qwen-Audio show that our method achieves the state-of-the-art (SOTA) efficiency-performance trade-off compared with other baseline methods. Specifically, FastAdaSP achieved 7x memory efficiency and 1.83x decoding throughput without any degradation on tasks like Emotion Recognition (ER) and Spoken Question Answering (SQA). The code will be available at https://github.com/yichen14/FastAdaSP <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.03007v1-abstract-full').style.display = 'none'; document.getElementById('2410.03007v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">EMNLP 2024 Industry Track</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.00528">arXiv:2410.00528</a> <span> [<a href="https://arxiv.org/pdf/2410.00528">pdf</a>, <a href="https://arxiv.org/format/2410.00528">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> End-to-End Speech Recognition with Pre-trained Masked Language Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Higuchi%2C+Y">Yosuke Higuchi</a>, <a href="/search/eess?searchtype=author&query=Ogawa%2C+T">Tetsuji Ogawa</a>, <a href="/search/eess?searchtype=author&query=Kobayashi%2C+T">Tetsunori Kobayashi</a>, <a href="/search/eess?searchtype=author&query=Watanabe%2C+S">Shinji Watanabe</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.00528v1-abstract-short" style="display: inline;"> We present a novel approach to end-to-end automatic speech recognition (ASR) that utilizes pre-trained masked language models (LMs) to facilitate the extraction of linguistic information. The proposed models, BERT-CTC and BECTRA, are specifically designed to effectively integrate pre-trained LMs (e.g., BERT) into end-to-end ASR models. BERT-CTC adapts BERT for connectionist temporal classification… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.00528v1-abstract-full').style.display = 'inline'; document.getElementById('2410.00528v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.00528v1-abstract-full" style="display: none;"> We present a novel approach to end-to-end automatic speech recognition (ASR) that utilizes pre-trained masked language models (LMs) to facilitate the extraction of linguistic information. The proposed models, BERT-CTC and BECTRA, are specifically designed to effectively integrate pre-trained LMs (e.g., BERT) into end-to-end ASR models. BERT-CTC adapts BERT for connectionist temporal classification (CTC) by addressing the constraint of the conditional independence assumption between output tokens. This enables explicit conditioning of BERT's contextualized embeddings in the ASR process, seamlessly merging audio and linguistic information through an iterative refinement algorithm. BECTRA extends BERT-CTC to the transducer framework and trains the decoder network using a vocabulary suitable for ASR training. This aims to bridge the gap between the text processed in end-to-end ASR and BERT, as these models have distinct vocabularies with varying text formats and styles, such as the presence of punctuation. Experimental results on various ASR tasks demonstrate that the proposed models improve over both the CTC and transducer-based baselines, owing to the incorporation of BERT knowledge. Moreover, our in-depth analysis and investigation verify the effectiveness of the proposed formulations and architectural designs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.00528v1-abstract-full').style.display = 'none'; document.getElementById('2410.00528v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.18428">arXiv:2409.18428</a> <span> [<a href="https://arxiv.org/pdf/2409.18428">pdf</a>, <a href="https://arxiv.org/format/2409.18428">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Improving Multilingual ASR in the Wild Using Simple N-best Re-ranking </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Yan%2C+B">Brian Yan</a>, <a href="/search/eess?searchtype=author&query=Pratap%2C+V">Vineel Pratap</a>, <a href="/search/eess?searchtype=author&query=Watanabe%2C+S">Shinji Watanabe</a>, <a href="/search/eess?searchtype=author&query=Auli%2C+M">Michael Auli</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.18428v1-abstract-short" style="display: inline;"> Multilingual Automatic Speech Recognition (ASR) models are typically evaluated in a setting where the ground-truth language of the speech utterance is known, however, this is often not the case for most practical settings. Automatic Spoken Language Identification (SLID) models are not perfect and misclassifications have a substantial impact on the final ASR accuracy. In this paper, we present a si… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.18428v1-abstract-full').style.display = 'inline'; document.getElementById('2409.18428v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.18428v1-abstract-full" style="display: none;"> Multilingual Automatic Speech Recognition (ASR) models are typically evaluated in a setting where the ground-truth language of the speech utterance is known, however, this is often not the case for most practical settings. Automatic Spoken Language Identification (SLID) models are not perfect and misclassifications have a substantial impact on the final ASR accuracy. In this paper, we present a simple and effective N-best re-ranking approach to improve multilingual ASR accuracy for several prominent acoustic models by employing external features such as language models and text-based language identification models. Our results on FLEURS using the MMS and Whisper models show spoken language identification accuracy improvements of 8.7% and 6.1%, respectively and word error rates which are 3.3% and 2.0% lower on these benchmarks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.18428v1-abstract-full').style.display = 'none'; document.getElementById('2409.18428v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.17285">arXiv:2409.17285</a> <span> [<a href="https://arxiv.org/pdf/2409.17285">pdf</a>, <a href="https://arxiv.org/format/2409.17285">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> SpoofCeleb: Speech Deepfake Detection and SASV In The Wild </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Jung%2C+J">Jee-weon Jung</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+Y">Yihan Wu</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+X">Xin Wang</a>, <a href="/search/eess?searchtype=author&query=Kim%2C+J">Ji-Hoon Kim</a>, <a href="/search/eess?searchtype=author&query=Maiti%2C+S">Soumi Maiti</a>, <a href="/search/eess?searchtype=author&query=Matsunaga%2C+Y">Yuta Matsunaga</a>, <a href="/search/eess?searchtype=author&query=Shim%2C+H">Hye-jin Shim</a>, <a href="/search/eess?searchtype=author&query=Tian%2C+J">Jinchuan Tian</a>, <a href="/search/eess?searchtype=author&query=Evans%2C+N">Nicholas Evans</a>, <a href="/search/eess?searchtype=author&query=Chung%2C+J+S">Joon Son Chung</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+W">Wangyou Zhang</a>, <a href="/search/eess?searchtype=author&query=Um%2C+S">Seyun Um</a>, <a href="/search/eess?searchtype=author&query=Takamichi%2C+S">Shinnosuke Takamichi</a>, <a href="/search/eess?searchtype=author&query=Watanabe%2C+S">Shinji Watanabe</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.17285v1-abstract-short" style="display: inline;"> This paper introduces SpoofCeleb, a dataset designed for Speech Deepfake Detection (SDD) and Spoofing-robust Automatic Speaker Verification (SASV), utilizing source data from real-world conditions and spoofing attacks generated by Text-To-Speech (TTS) systems also trained on the same real-world data. Robust recognition systems require speech data recorded in varied acoustic environments with diffe… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.17285v1-abstract-full').style.display = 'inline'; document.getElementById('2409.17285v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.17285v1-abstract-full" style="display: none;"> This paper introduces SpoofCeleb, a dataset designed for Speech Deepfake Detection (SDD) and Spoofing-robust Automatic Speaker Verification (SASV), utilizing source data from real-world conditions and spoofing attacks generated by Text-To-Speech (TTS) systems also trained on the same real-world data. Robust recognition systems require speech data recorded in varied acoustic environments with different levels of noise to be trained. However, existing datasets typically include clean, high-quality recordings (bona fide data) due to the requirements for TTS training; studio-quality or well-recorded read speech is typically necessary to train TTS models. Existing SDD datasets also have limited usefulness for training SASV models due to insufficient speaker diversity. We present SpoofCeleb, which leverages a fully automated pipeline that processes the VoxCeleb1 dataset, transforming it into a suitable form for TTS training. We subsequently train 23 contemporary TTS systems. The resulting SpoofCeleb dataset comprises over 2.5 million utterances from 1,251 unique speakers, collected under natural, real-world conditions. The dataset includes carefully partitioned training, validation, and evaluation sets with well-controlled experimental protocols. We provide baseline results for both SDD and SASV tasks. All data, protocols, and baselines are publicly available at https://jungjee.github.io/spoofceleb. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.17285v1-abstract-full').style.display = 'none'; document.getElementById('2409.17285v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">9 pages, 2 figures, 8 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.15897">arXiv:2409.15897</a> <span> [<a href="https://arxiv.org/pdf/2409.15897">pdf</a>, <a href="https://arxiv.org/ps/2409.15897">ps</a>, <a href="https://arxiv.org/format/2409.15897">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> ESPnet-Codec: Comprehensive Training and Evaluation of Neural Codecs for Audio, Music, and Speech </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Shi%2C+J">Jiatong Shi</a>, <a href="/search/eess?searchtype=author&query=Tian%2C+J">Jinchuan Tian</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+Y">Yihan Wu</a>, <a href="/search/eess?searchtype=author&query=Jung%2C+J">Jee-weon Jung</a>, <a href="/search/eess?searchtype=author&query=Yip%2C+J+Q">Jia Qi Yip</a>, <a href="/search/eess?searchtype=author&query=Masuyama%2C+Y">Yoshiki Masuyama</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+W">William Chen</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+Y">Yuning Wu</a>, <a href="/search/eess?searchtype=author&query=Tang%2C+Y">Yuxun Tang</a>, <a href="/search/eess?searchtype=author&query=Baali%2C+M">Massa Baali</a>, <a href="/search/eess?searchtype=author&query=Alharhi%2C+D">Dareen Alharhi</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+D">Dong Zhang</a>, <a href="/search/eess?searchtype=author&query=Deng%2C+R">Ruifan Deng</a>, <a href="/search/eess?searchtype=author&query=Srivastava%2C+T">Tejes Srivastava</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+H">Haibin Wu</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+A+H">Alexander H. Liu</a>, <a href="/search/eess?searchtype=author&query=Raj%2C+B">Bhiksha Raj</a>, <a href="/search/eess?searchtype=author&query=Jin%2C+Q">Qin Jin</a>, <a href="/search/eess?searchtype=author&query=Song%2C+R">Ruihua Song</a>, <a href="/search/eess?searchtype=author&query=Watanabe%2C+S">Shinji Watanabe</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.15897v1-abstract-short" style="display: inline;"> Neural codecs have become crucial to recent speech and audio generation research. In addition to signal compression capabilities, discrete codecs have also been found to enhance downstream training efficiency and compatibility with autoregressive language models. However, as extensive downstream applications are investigated, challenges have arisen in ensuring fair comparisons across diverse appli… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.15897v1-abstract-full').style.display = 'inline'; document.getElementById('2409.15897v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.15897v1-abstract-full" style="display: none;"> Neural codecs have become crucial to recent speech and audio generation research. In addition to signal compression capabilities, discrete codecs have also been found to enhance downstream training efficiency and compatibility with autoregressive language models. However, as extensive downstream applications are investigated, challenges have arisen in ensuring fair comparisons across diverse applications. To address these issues, we present a new open-source platform ESPnet-Codec, which is built on ESPnet and focuses on neural codec training and evaluation. ESPnet-Codec offers various recipes in audio, music, and speech for training and evaluation using several widely adopted codec models. Together with ESPnet-Codec, we present VERSA, a standalone evaluation toolkit, which provides a comprehensive evaluation of codec performance over 20 audio evaluation metrics. Notably, we demonstrate that ESPnet-Codec can be integrated into six ESPnet tasks, supporting diverse applications. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.15897v1-abstract-full').style.display = 'none'; document.getElementById('2409.15897v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by SLT</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.15732">arXiv:2409.15732</a> <span> [<a href="https://arxiv.org/pdf/2409.15732">pdf</a>, <a href="https://arxiv.org/format/2409.15732">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Hypothesis Clustering and Merging: Novel MultiTalker Speech Recognition with Speaker Tokens </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Kashiwagi%2C+Y">Yosuke Kashiwagi</a>, <a href="/search/eess?searchtype=author&query=Futami%2C+H">Hayato Futami</a>, <a href="/search/eess?searchtype=author&query=Tsunoo%2C+E">Emiru Tsunoo</a>, <a href="/search/eess?searchtype=author&query=Arora%2C+S">Siddhant Arora</a>, <a href="/search/eess?searchtype=author&query=Watanabe%2C+S">Shinji Watanabe</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.15732v1-abstract-short" style="display: inline;"> In many real-world scenarios, such as meetings, multiple speakers are present with an unknown number of participants, and their utterances often overlap. We address these multi-speaker challenges by a novel attention-based encoder-decoder method augmented with special speaker class tokens obtained by speaker clustering. During inference, we select multiple recognition hypotheses conditioned on pre… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.15732v1-abstract-full').style.display = 'inline'; document.getElementById('2409.15732v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.15732v1-abstract-full" style="display: none;"> In many real-world scenarios, such as meetings, multiple speakers are present with an unknown number of participants, and their utterances often overlap. We address these multi-speaker challenges by a novel attention-based encoder-decoder method augmented with special speaker class tokens obtained by speaker clustering. During inference, we select multiple recognition hypotheses conditioned on predicted speaker cluster tokens, and these hypotheses are merged by agglomerative hierarchical clustering (AHC) based on the normalized edit distance. The clustered hypotheses result in the multi-speaker transcriptions with the appropriate number of speakers determined by AHC. Our experiments on the LibriMix dataset demonstrate that our proposed method was particularly effective in complex 3-mix environments, achieving a 55% relative error reduction on clean data and a 36% relative error reduction on noisy data compared with conventional serialized output training. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.15732v1-abstract-full').style.display = 'none'; document.getElementById('2409.15732v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Submitted to ICASSP 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.14085">arXiv:2409.14085</a> <span> [<a href="https://arxiv.org/pdf/2409.14085">pdf</a>, <a href="https://arxiv.org/format/2409.14085">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Codec-SUPERB @ SLT 2024: A lightweight benchmark for neural audio codec models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Wu%2C+H">Haibin Wu</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+X">Xuanjun Chen</a>, <a href="/search/eess?searchtype=author&query=Lin%2C+Y">Yi-Cheng Lin</a>, <a href="/search/eess?searchtype=author&query=Chang%2C+K">Kaiwei Chang</a>, <a href="/search/eess?searchtype=author&query=Du%2C+J">Jiawei Du</a>, <a href="/search/eess?searchtype=author&query=Lu%2C+K">Ke-Han Lu</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+A+H">Alexander H. Liu</a>, <a href="/search/eess?searchtype=author&query=Chung%2C+H">Ho-Lam Chung</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+Y">Yuan-Kuei Wu</a>, <a href="/search/eess?searchtype=author&query=Yang%2C+D">Dongchao Yang</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+S">Songxiang Liu</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+Y">Yi-Chiao Wu</a>, <a href="/search/eess?searchtype=author&query=Tan%2C+X">Xu Tan</a>, <a href="/search/eess?searchtype=author&query=Glass%2C+J">James Glass</a>, <a href="/search/eess?searchtype=author&query=Watanabe%2C+S">Shinji Watanabe</a>, <a href="/search/eess?searchtype=author&query=Lee%2C+H">Hung-yi Lee</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.14085v1-abstract-short" style="display: inline;"> Neural audio codec models are becoming increasingly important as they serve as tokenizers for audio, enabling efficient transmission or facilitating speech language modeling. The ideal neural audio codec should maintain content, paralinguistics, speaker characteristics, and audio information even at low bitrates. Recently, numerous advanced neural codec models have been proposed. However, codec mo… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.14085v1-abstract-full').style.display = 'inline'; document.getElementById('2409.14085v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.14085v1-abstract-full" style="display: none;"> Neural audio codec models are becoming increasingly important as they serve as tokenizers for audio, enabling efficient transmission or facilitating speech language modeling. The ideal neural audio codec should maintain content, paralinguistics, speaker characteristics, and audio information even at low bitrates. Recently, numerous advanced neural codec models have been proposed. However, codec models are often tested under varying experimental conditions. As a result, we introduce the Codec-SUPERB challenge at SLT 2024, designed to facilitate fair and lightweight comparisons among existing codec models and inspire advancements in the field. This challenge brings together representative speech applications and objective metrics, and carefully selects license-free datasets, sampling them into small sets to reduce evaluation computation costs. This paper presents the challenge's rules, datasets, five participant systems, results, and findings. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.14085v1-abstract-full').style.display = 'none'; document.getElementById('2409.14085v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.12370">arXiv:2409.12370</a> <span> [<a href="https://arxiv.org/pdf/2409.12370">pdf</a>, <a href="https://arxiv.org/format/2409.12370">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Robust Audiovisual Speech Recognition Models with Mixture-of-Experts </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Wu%2C+Y">Yihan Wu</a>, <a href="/search/eess?searchtype=author&query=Peng%2C+Y">Yifan Peng</a>, <a href="/search/eess?searchtype=author&query=Lu%2C+Y">Yichen Lu</a>, <a href="/search/eess?searchtype=author&query=Chang%2C+X">Xuankai Chang</a>, <a href="/search/eess?searchtype=author&query=Song%2C+R">Ruihua Song</a>, <a href="/search/eess?searchtype=author&query=Watanabe%2C+S">Shinji Watanabe</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.12370v1-abstract-short" style="display: inline;"> Visual signals can enhance audiovisual speech recognition accuracy by providing additional contextual information. Given the complexity of visual signals, an audiovisual speech recognition model requires robust generalization capabilities across diverse video scenarios, presenting a significant challenge. In this paper, we introduce EVA, leveraging the mixture-of-Experts for audioVisual ASR to per… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.12370v1-abstract-full').style.display = 'inline'; document.getElementById('2409.12370v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.12370v1-abstract-full" style="display: none;"> Visual signals can enhance audiovisual speech recognition accuracy by providing additional contextual information. Given the complexity of visual signals, an audiovisual speech recognition model requires robust generalization capabilities across diverse video scenarios, presenting a significant challenge. In this paper, we introduce EVA, leveraging the mixture-of-Experts for audioVisual ASR to perform robust speech recognition for ``in-the-wild'' videos. Specifically, we first encode visual information into visual tokens sequence and map them into speech space by a lightweight projection. Then, we build EVA upon a robust pretrained speech recognition model, ensuring its generalization ability. Moreover, to incorporate visual information effectively, we inject visual information into the ASR model through a mixture-of-experts module. Experiments show our model achieves state-of-the-art results on three benchmarks, which demonstrates the generalization ability of EVA across diverse video domains. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.12370v1-abstract-full').style.display = 'none'; document.getElementById('2409.12370v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">6 pages, 2 figures, accepted by IEEE Spoken Language Technology Workshop 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.10791">arXiv:2409.10791</a> <span> [<a href="https://arxiv.org/pdf/2409.10791">pdf</a>, <a href="https://arxiv.org/format/2409.10791">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Speaker-IPL: Unsupervised Learning of Speaker Characteristics with i-Vector based Pseudo-Labels </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Aldeneh%2C+Z">Zakaria Aldeneh</a>, <a href="/search/eess?searchtype=author&query=Higuchi%2C+T">Takuya Higuchi</a>, <a href="/search/eess?searchtype=author&query=Jung%2C+J">Jee-weon Jung</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+L">Li-Wei Chen</a>, <a href="/search/eess?searchtype=author&query=Shum%2C+S">Stephen Shum</a>, <a href="/search/eess?searchtype=author&query=Abdelaziz%2C+A+H">Ahmed Hussen Abdelaziz</a>, <a href="/search/eess?searchtype=author&query=Watanabe%2C+S">Shinji Watanabe</a>, <a href="/search/eess?searchtype=author&query=Likhomanenko%2C+T">Tatiana Likhomanenko</a>, <a href="/search/eess?searchtype=author&query=Theobald%2C+B">Barry-John Theobald</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.10791v1-abstract-short" style="display: inline;"> Iterative self-training, or iterative pseudo-labeling (IPL)--using an improved model from the current iteration to provide pseudo-labels for the next iteration--has proven to be a powerful approach to enhance the quality of speaker representations. Recent applications of IPL in unsupervised speaker recognition start with representations extracted from very elaborate self-supervised methods (e.g.,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.10791v1-abstract-full').style.display = 'inline'; document.getElementById('2409.10791v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.10791v1-abstract-full" style="display: none;"> Iterative self-training, or iterative pseudo-labeling (IPL)--using an improved model from the current iteration to provide pseudo-labels for the next iteration--has proven to be a powerful approach to enhance the quality of speaker representations. Recent applications of IPL in unsupervised speaker recognition start with representations extracted from very elaborate self-supervised methods (e.g., DINO). However, training such strong self-supervised models is not straightforward (they require hyper-parameters tuning and may not generalize to out-of-domain data) and, moreover, may not be needed at all. To this end, we show the simple, well-studied, and established i-vector generative model is enough to bootstrap the IPL process for unsupervised learning of speaker representations. We also systematically study the impact of other components on the IPL process, which includes the initial model, the encoder, augmentations, the number of clusters, and the clustering algorithm. Remarkably, we find that even with a simple and significantly weaker initial model like i-vector, IPL can still achieve speaker verification performance that rivals state-of-the-art methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.10791v1-abstract-full').style.display = 'none'; document.getElementById('2409.10791v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Submitted to ICASSP 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.10788">arXiv:2409.10788</a> <span> [<a href="https://arxiv.org/pdf/2409.10788">pdf</a>, <a href="https://arxiv.org/format/2409.10788">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Exploring Prediction Targets in Masked Pre-Training for Speech Foundation Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Chen%2C+L">Li-Wei Chen</a>, <a href="/search/eess?searchtype=author&query=Higuchi%2C+T">Takuya Higuchi</a>, <a href="/search/eess?searchtype=author&query=Bai%2C+H">He Bai</a>, <a href="/search/eess?searchtype=author&query=Abdelaziz%2C+A+H">Ahmed Hussen Abdelaziz</a>, <a href="/search/eess?searchtype=author&query=Rudnicky%2C+A">Alexander Rudnicky</a>, <a href="/search/eess?searchtype=author&query=Watanabe%2C+S">Shinji Watanabe</a>, <a href="/search/eess?searchtype=author&query=Likhomanenko%2C+T">Tatiana Likhomanenko</a>, <a href="/search/eess?searchtype=author&query=Theobald%2C+B">Barry-John Theobald</a>, <a href="/search/eess?searchtype=author&query=Aldeneh%2C+Z">Zakaria Aldeneh</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.10788v1-abstract-short" style="display: inline;"> Speech foundation models, such as HuBERT and its variants, are pre-trained on large amounts of unlabeled speech for various downstream tasks. These models use a masked prediction objective, where the model learns to predict information about masked input segments from the unmasked context. The choice of prediction targets in this framework can influence performance on downstream tasks. For example… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.10788v1-abstract-full').style.display = 'inline'; document.getElementById('2409.10788v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.10788v1-abstract-full" style="display: none;"> Speech foundation models, such as HuBERT and its variants, are pre-trained on large amounts of unlabeled speech for various downstream tasks. These models use a masked prediction objective, where the model learns to predict information about masked input segments from the unmasked context. The choice of prediction targets in this framework can influence performance on downstream tasks. For example, targets that encode prosody are beneficial for speaker-related tasks, while targets that encode phonetics are more suited for content-related tasks. Additionally, prediction targets can vary in the level of detail they encode; targets that encode fine-grained acoustic details are beneficial for denoising tasks, while targets that encode higher-level abstractions are more suited for content-related tasks. Despite the importance of prediction targets, the design choices that affect them have not been thoroughly studied. This work explores the design choices and their impact on downstream task performance. Our results indicate that the commonly used design choices for HuBERT can be suboptimal. We propose novel approaches to create more informative prediction targets and demonstrate their effectiveness through improvements across various downstream tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.10788v1-abstract-full').style.display = 'none'; document.getElementById('2409.10788v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Submitted to ICASSP 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.09785">arXiv:2409.09785</a> <span> [<a href="https://arxiv.org/pdf/2409.09785">pdf</a>, <a href="https://arxiv.org/format/2409.09785">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Large Language Model Based Generative Error Correction: A Challenge and Baselines for Speech Recognition, Speaker Tagging, and Emotion Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Yang%2C+C+H">Chao-Han Huck Yang</a>, <a href="/search/eess?searchtype=author&query=Park%2C+T">Taejin Park</a>, <a href="/search/eess?searchtype=author&query=Gong%2C+Y">Yuan Gong</a>, <a href="/search/eess?searchtype=author&query=Li%2C+Y">Yuanchao Li</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+Z">Zhehuai Chen</a>, <a href="/search/eess?searchtype=author&query=Lin%2C+Y">Yen-Ting Lin</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+C">Chen Chen</a>, <a href="/search/eess?searchtype=author&query=Hu%2C+Y">Yuchen Hu</a>, <a href="/search/eess?searchtype=author&query=Dhawan%2C+K">Kunal Dhawan</a>, <a href="/search/eess?searchtype=author&query=%C5%BBelasko%2C+P">Piotr 呕elasko</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+C">Chao Zhang</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+Y">Yun-Nung Chen</a>, <a href="/search/eess?searchtype=author&query=Tsao%2C+Y">Yu Tsao</a>, <a href="/search/eess?searchtype=author&query=Balam%2C+J">Jagadeesh Balam</a>, <a href="/search/eess?searchtype=author&query=Ginsburg%2C+B">Boris Ginsburg</a>, <a href="/search/eess?searchtype=author&query=Siniscalchi%2C+S+M">Sabato Marco Siniscalchi</a>, <a href="/search/eess?searchtype=author&query=Chng%2C+E+S">Eng Siong Chng</a>, <a href="/search/eess?searchtype=author&query=Bell%2C+P">Peter Bell</a>, <a href="/search/eess?searchtype=author&query=Lai%2C+C">Catherine Lai</a>, <a href="/search/eess?searchtype=author&query=Watanabe%2C+S">Shinji Watanabe</a>, <a href="/search/eess?searchtype=author&query=Stolcke%2C+A">Andreas Stolcke</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.09785v3-abstract-short" style="display: inline;"> Given recent advances in generative AI technology, a key question is how large language models (LLMs) can enhance acoustic modeling tasks using text decoding results from a frozen, pretrained automatic speech recognition (ASR) model. To explore new capabilities in language modeling for speech processing, we introduce the generative speech transcription error correction (GenSEC) challenge. This cha… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.09785v3-abstract-full').style.display = 'inline'; document.getElementById('2409.09785v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.09785v3-abstract-full" style="display: none;"> Given recent advances in generative AI technology, a key question is how large language models (LLMs) can enhance acoustic modeling tasks using text decoding results from a frozen, pretrained automatic speech recognition (ASR) model. To explore new capabilities in language modeling for speech processing, we introduce the generative speech transcription error correction (GenSEC) challenge. This challenge comprises three post-ASR language modeling tasks: (i) post-ASR transcription correction, (ii) speaker tagging, and (iii) emotion recognition. These tasks aim to emulate future LLM-based agents handling voice-based interfaces while remaining accessible to a broad audience by utilizing open pretrained language models or agent-based APIs. We also discuss insights from baseline evaluations, as well as lessons learned for designing future evaluations. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.09785v3-abstract-full').style.display = 'none'; document.getElementById('2409.09785v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 15 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">IEEE SLT 2024. The initial draft version has been done in December 2023. Post-ASR Text Processing and Understanding Community and LlaMA-7B pre-training correction model: https://huggingface.co/GenSEC-LLM/SLT-Task1-Llama2-7b-HyPo-baseline</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.09506">arXiv:2409.09506</a> <span> [<a href="https://arxiv.org/pdf/2409.09506">pdf</a>, <a href="https://arxiv.org/format/2409.09506">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> ESPnet-EZ: Python-only ESPnet for Easy Fine-tuning and Integration </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Someki%2C+M">Masao Someki</a>, <a href="/search/eess?searchtype=author&query=Choi%2C+K">Kwanghee Choi</a>, <a href="/search/eess?searchtype=author&query=Arora%2C+S">Siddhant Arora</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+W">William Chen</a>, <a href="/search/eess?searchtype=author&query=Cornell%2C+S">Samuele Cornell</a>, <a href="/search/eess?searchtype=author&query=Han%2C+J">Jionghao Han</a>, <a href="/search/eess?searchtype=author&query=Peng%2C+Y">Yifan Peng</a>, <a href="/search/eess?searchtype=author&query=Shi%2C+J">Jiatong Shi</a>, <a href="/search/eess?searchtype=author&query=Srivastav%2C+V">Vaibhav Srivastav</a>, <a href="/search/eess?searchtype=author&query=Watanabe%2C+S">Shinji Watanabe</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.09506v1-abstract-short" style="display: inline;"> We introduce ESPnet-EZ, an extension of the open-source speech processing toolkit ESPnet, aimed at quick and easy development of speech models. ESPnet-EZ focuses on two major aspects: (i) easy fine-tuning and inference of existing ESPnet models on various tasks and (ii) easy integration with popular deep neural network frameworks such as PyTorch-Lightning, Hugging Face transformers and datasets, a… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.09506v1-abstract-full').style.display = 'inline'; document.getElementById('2409.09506v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.09506v1-abstract-full" style="display: none;"> We introduce ESPnet-EZ, an extension of the open-source speech processing toolkit ESPnet, aimed at quick and easy development of speech models. ESPnet-EZ focuses on two major aspects: (i) easy fine-tuning and inference of existing ESPnet models on various tasks and (ii) easy integration with popular deep neural network frameworks such as PyTorch-Lightning, Hugging Face transformers and datasets, and Lhotse. By replacing ESPnet design choices inherited from Kaldi with a Python-only, Bash-free interface, we dramatically reduce the effort required to build, debug, and use a new model. For example, to fine-tune a speech foundation model, ESPnet-EZ, compared to ESPnet, reduces the number of newly written code by 2.7x and the amount of dependent code by 6.7x while dramatically reducing the Bash script dependencies. The codebase of ESPnet-EZ is publicly available. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.09506v1-abstract-full').style.display = 'none'; document.getElementById('2409.09506v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to SLT 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.08711">arXiv:2409.08711</a> <span> [<a href="https://arxiv.org/pdf/2409.08711">pdf</a>, <a href="https://arxiv.org/format/2409.08711">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Text-To-Speech Synthesis In The Wild </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Jung%2C+J">Jee-weon Jung</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+W">Wangyou Zhang</a>, <a href="/search/eess?searchtype=author&query=Maiti%2C+S">Soumi Maiti</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+Y">Yihan Wu</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+X">Xin Wang</a>, <a href="/search/eess?searchtype=author&query=Kim%2C+J">Ji-Hoon Kim</a>, <a href="/search/eess?searchtype=author&query=Matsunaga%2C+Y">Yuta Matsunaga</a>, <a href="/search/eess?searchtype=author&query=Um%2C+S">Seyun Um</a>, <a href="/search/eess?searchtype=author&query=Tian%2C+J">Jinchuan Tian</a>, <a href="/search/eess?searchtype=author&query=Shim%2C+H">Hye-jin Shim</a>, <a href="/search/eess?searchtype=author&query=Evans%2C+N">Nicholas Evans</a>, <a href="/search/eess?searchtype=author&query=Chung%2C+J+S">Joon Son Chung</a>, <a href="/search/eess?searchtype=author&query=Takamichi%2C+S">Shinnosuke Takamichi</a>, <a href="/search/eess?searchtype=author&query=Watanabe%2C+S">Shinji Watanabe</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.08711v1-abstract-short" style="display: inline;"> Text-to-speech (TTS) systems are traditionally trained using modest databases of studio-quality, prompted or read speech collected in benign acoustic environments such as anechoic rooms. The recent literature nonetheless shows efforts to train TTS systems using data collected in the wild. While this approach allows for the use of massive quantities of natural speech, until now, there are no common… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.08711v1-abstract-full').style.display = 'inline'; document.getElementById('2409.08711v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.08711v1-abstract-full" style="display: none;"> Text-to-speech (TTS) systems are traditionally trained using modest databases of studio-quality, prompted or read speech collected in benign acoustic environments such as anechoic rooms. The recent literature nonetheless shows efforts to train TTS systems using data collected in the wild. While this approach allows for the use of massive quantities of natural speech, until now, there are no common datasets. We introduce the TTS In the Wild (TITW) dataset, the result of a fully automated pipeline, in this case, applied to the VoxCeleb1 dataset commonly used for speaker recognition. We further propose two training sets. TITW-Hard is derived from the transcription, segmentation, and selection of VoxCeleb1 source data. TITW-Easy is derived from the additional application of enhancement and additional data selection based on DNSMOS. We show that a number of recent TTS models can be trained successfully using TITW-Easy, but that it remains extremely challenging to produce similar results using TITW-Hard. Both the dataset and protocols are publicly available and support the benchmarking of TTS systems trained using TITW data. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.08711v1-abstract-full').style.display = 'none'; document.getElementById('2409.08711v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages, submitted to ICASSP 2025 as a conference paper</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.07226">arXiv:2409.07226</a> <span> [<a href="https://arxiv.org/pdf/2409.07226">pdf</a>, <a href="https://arxiv.org/format/2409.07226">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Muskits-ESPnet: A Comprehensive Toolkit for Singing Voice Synthesis in New Paradigm </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Wu%2C+Y">Yuning Wu</a>, <a href="/search/eess?searchtype=author&query=Shi%2C+J">Jiatong Shi</a>, <a href="/search/eess?searchtype=author&query=Yu%2C+Y">Yifeng Yu</a>, <a href="/search/eess?searchtype=author&query=Tang%2C+Y">Yuxun Tang</a>, <a href="/search/eess?searchtype=author&query=Qian%2C+T">Tao Qian</a>, <a href="/search/eess?searchtype=author&query=Lin%2C+Y">Yueqian Lin</a>, <a href="/search/eess?searchtype=author&query=Han%2C+J">Jionghao Han</a>, <a href="/search/eess?searchtype=author&query=Bai%2C+X">Xinyi Bai</a>, <a href="/search/eess?searchtype=author&query=Watanabe%2C+S">Shinji Watanabe</a>, <a href="/search/eess?searchtype=author&query=Jin%2C+Q">Qin Jin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.07226v2-abstract-short" style="display: inline;"> This research presents Muskits-ESPnet, a versatile toolkit that introduces new paradigms to Singing Voice Synthesis (SVS) through the application of pretrained audio models in both continuous and discrete approaches. Specifically, we explore discrete representations derived from SSL models and audio codecs and offer significant advantages in versatility and intelligence, supporting multi-format in… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.07226v2-abstract-full').style.display = 'inline'; document.getElementById('2409.07226v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.07226v2-abstract-full" style="display: none;"> This research presents Muskits-ESPnet, a versatile toolkit that introduces new paradigms to Singing Voice Synthesis (SVS) through the application of pretrained audio models in both continuous and discrete approaches. Specifically, we explore discrete representations derived from SSL models and audio codecs and offer significant advantages in versatility and intelligence, supporting multi-format inputs and adaptable data processing workflows for various SVS models. The toolkit features automatic music score error detection and correction, as well as a perception auto-evaluation module to imitate human subjective evaluating scores. Muskits-ESPnet is available at \url{https://github.com/espnet/espnet}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.07226v2-abstract-full').style.display = 'none'; document.getElementById('2409.07226v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 11 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ACMMM 2024 demo track</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.09215">arXiv:2408.09215</a> <span> [<a href="https://arxiv.org/pdf/2408.09215">pdf</a>, <a href="https://arxiv.org/format/2408.09215">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Generating Data with Text-to-Speech and Large-Language Models for Conversational Speech Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Cornell%2C+S">Samuele Cornell</a>, <a href="/search/eess?searchtype=author&query=Darefsky%2C+J">Jordan Darefsky</a>, <a href="/search/eess?searchtype=author&query=Duan%2C+Z">Zhiyao Duan</a>, <a href="/search/eess?searchtype=author&query=Watanabe%2C+S">Shinji Watanabe</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.09215v1-abstract-short" style="display: inline;"> Currently, a common approach in many speech processing tasks is to leverage large scale pre-trained models by fine-tuning them on in-domain data for a particular application. Yet obtaining even a small amount of such data can be problematic, especially for sensitive domains and conversational speech scenarios, due to both privacy issues and annotation costs. To address this, synthetic data generat… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.09215v1-abstract-full').style.display = 'inline'; document.getElementById('2408.09215v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.09215v1-abstract-full" style="display: none;"> Currently, a common approach in many speech processing tasks is to leverage large scale pre-trained models by fine-tuning them on in-domain data for a particular application. Yet obtaining even a small amount of such data can be problematic, especially for sensitive domains and conversational speech scenarios, due to both privacy issues and annotation costs. To address this, synthetic data generation using single speaker datasets has been employed. Yet, for multi-speaker cases, such an approach often requires extensive manual effort and is prone to domain mismatches. In this work, we propose a synthetic data generation pipeline for multi-speaker conversational ASR, leveraging a large language model (LLM) for content creation and a conversational multi-speaker text-to-speech (TTS) model for speech synthesis. We conduct evaluation by fine-tuning the Whisper ASR model for telephone and distant conversational speech settings, using both in-domain data and generated synthetic data. Our results show that the proposed method is able to significantly outperform classical multi-speaker generation approaches that use external, non-conversational speech datasets. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.09215v1-abstract-full').style.display = 'none'; document.getElementById('2408.09215v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">To appear at SynData4GenAI 2024 workshop</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.00624">arXiv:2408.00624</a> <span> [<a href="https://arxiv.org/pdf/2408.00624">pdf</a>, <a href="https://arxiv.org/format/2408.00624">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> SynesLM: A Unified Approach for Audio-visual Speech Recognition and Translation via Language Model and Synthetic Data </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Lu%2C+Y">Yichen Lu</a>, <a href="/search/eess?searchtype=author&query=Song%2C+J">Jiaqi Song</a>, <a href="/search/eess?searchtype=author&query=Chang%2C+X">Xuankai Chang</a>, <a href="/search/eess?searchtype=author&query=Bian%2C+H">Hengwei Bian</a>, <a href="/search/eess?searchtype=author&query=Maiti%2C+S">Soumi Maiti</a>, <a href="/search/eess?searchtype=author&query=Watanabe%2C+S">Shinji Watanabe</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.00624v1-abstract-short" style="display: inline;"> In this work, we present SynesLM, an unified model which can perform three multimodal language understanding tasks: audio-visual automatic speech recognition(AV-ASR) and visual-aided speech/machine translation(VST/VMT). Unlike previous research that focused on lip motion as visual cues for speech signals, our work explores more general visual information within entire frames, such as objects and a… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.00624v1-abstract-full').style.display = 'inline'; document.getElementById('2408.00624v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.00624v1-abstract-full" style="display: none;"> In this work, we present SynesLM, an unified model which can perform three multimodal language understanding tasks: audio-visual automatic speech recognition(AV-ASR) and visual-aided speech/machine translation(VST/VMT). Unlike previous research that focused on lip motion as visual cues for speech signals, our work explores more general visual information within entire frames, such as objects and actions. Additionally, we use synthetic image data to enhance the correlation between image and speech data. We benchmark SynesLM against the How2 dataset, demonstrating performance on par with state-of-the-art (SOTA) models dedicated to AV-ASR while maintaining our multitasking framework. Remarkably, for zero-shot AV-ASR, SynesLM achieved SOTA performance by lowering the Word Error Rate (WER) from 43.4% to 39.4% on the VisSpeech Dataset. Furthermore, our results in VST and VMT outperform the previous results, improving the BLEU score to 43.5 from 37.2 for VST, and to 54.8 from 54.4 for VMT. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.00624v1-abstract-full').style.display = 'none'; document.getElementById('2408.00624v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.16447">arXiv:2407.16447</a> <span> [<a href="https://arxiv.org/pdf/2407.16447">pdf</a>, <a href="https://arxiv.org/ps/2407.16447">ps</a>, <a href="https://arxiv.org/format/2407.16447">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> The CHiME-8 DASR Challenge for Generalizable and Array Agnostic Distant Automatic Speech Recognition and Diarization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Cornell%2C+S">Samuele Cornell</a>, <a href="/search/eess?searchtype=author&query=Park%2C+T">Taejin Park</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+S">Steve Huang</a>, <a href="/search/eess?searchtype=author&query=Boeddeker%2C+C">Christoph Boeddeker</a>, <a href="/search/eess?searchtype=author&query=Chang%2C+X">Xuankai Chang</a>, <a href="/search/eess?searchtype=author&query=Maciejewski%2C+M">Matthew Maciejewski</a>, <a href="/search/eess?searchtype=author&query=Wiesner%2C+M">Matthew Wiesner</a>, <a href="/search/eess?searchtype=author&query=Garcia%2C+P">Paola Garcia</a>, <a href="/search/eess?searchtype=author&query=Watanabe%2C+S">Shinji Watanabe</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.16447v1-abstract-short" style="display: inline;"> This paper presents the CHiME-8 DASR challenge which carries on from the previous edition CHiME-7 DASR (C7DASR) and the past CHiME-6 challenge. It focuses on joint multi-channel distant speech recognition (DASR) and diarization with one or more, possibly heterogeneous, devices. The main goal is to spur research towards meeting transcription approaches that can generalize across arbitrary number of… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.16447v1-abstract-full').style.display = 'inline'; document.getElementById('2407.16447v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.16447v1-abstract-full" style="display: none;"> This paper presents the CHiME-8 DASR challenge which carries on from the previous edition CHiME-7 DASR (C7DASR) and the past CHiME-6 challenge. It focuses on joint multi-channel distant speech recognition (DASR) and diarization with one or more, possibly heterogeneous, devices. The main goal is to spur research towards meeting transcription approaches that can generalize across arbitrary number of speakers, diverse settings (formal vs. informal conversations), meeting duration, wide-variety of acoustic scenarios and different recording configurations. Novelties with respect to C7DASR include: i) the addition of NOTSOFAR-1, an additional office/corporate meeting scenario, ii) a manually corrected Mixer 6 development set, iii) a new track in which we allow the use of large-language models (LLM) iv) a jury award mechanism to encourage participants to explore also more practical and innovative solutions. To lower the entry barrier for participants, we provide a standalone toolkit for downloading and preparing such datasets as well as performing text normalization and scoring their submissions. Furthermore, this year we also provide two baseline systems, one directly inherited from C7DASR and based on ESPnet and another one developed on NeMo and based on NeMo team submission in last year C7DASR. Baseline system results suggest that the addition of the NOTSOFAR-1 scenario significantly increases the task's difficulty due to its high number of speakers and very short duration. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.16447v1-abstract-full').style.display = 'none'; document.getElementById('2407.16447v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.03718">arXiv:2407.03718</a> <span> [<a href="https://arxiv.org/pdf/2407.03718">pdf</a>, <a href="https://arxiv.org/format/2407.03718">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Multi-Convformer: Extending Conformer with Multiple Convolution Kernels </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Prabhu%2C+D">Darshan Prabhu</a>, <a href="/search/eess?searchtype=author&query=Peng%2C+Y">Yifan Peng</a>, <a href="/search/eess?searchtype=author&query=Jyothi%2C+P">Preethi Jyothi</a>, <a href="/search/eess?searchtype=author&query=Watanabe%2C+S">Shinji Watanabe</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.03718v2-abstract-short" style="display: inline;"> Convolutions have become essential in state-of-the-art end-to-end Automatic Speech Recognition~(ASR) systems due to their efficient modelling of local context. Notably, its use in Conformers has led to superior performance compared to vanilla Transformer-based ASR systems. While components other than the convolution module in the Conformer have been reexamined, altering the convolution module itse… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.03718v2-abstract-full').style.display = 'inline'; document.getElementById('2407.03718v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.03718v2-abstract-full" style="display: none;"> Convolutions have become essential in state-of-the-art end-to-end Automatic Speech Recognition~(ASR) systems due to their efficient modelling of local context. Notably, its use in Conformers has led to superior performance compared to vanilla Transformer-based ASR systems. While components other than the convolution module in the Conformer have been reexamined, altering the convolution module itself has been far less explored. Towards this, we introduce Multi-Convformer that uses multiple convolution kernels within the convolution module of the Conformer in conjunction with gating. This helps in improved modeling of local dependencies at varying granularities. Our model rivals existing Conformer variants such as CgMLP and E-Branchformer in performance, while being more parameter efficient. We empirically compare our approach with Conformer and its variants across four different datasets and three different modelling paradigms and show up to 8% relative word error rate~(WER) improvements. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.03718v2-abstract-full').style.display = 'none'; document.getElementById('2407.03718v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to INTERSPEECH 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.00837">arXiv:2407.00837</a> <span> [<a href="https://arxiv.org/pdf/2407.00837">pdf</a>, <a href="https://arxiv.org/format/2407.00837">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Towards Robust Speech Representation Learning for Thousands of Languages </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Chen%2C+W">William Chen</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+W">Wangyou Zhang</a>, <a href="/search/eess?searchtype=author&query=Peng%2C+Y">Yifan Peng</a>, <a href="/search/eess?searchtype=author&query=Li%2C+X">Xinjian Li</a>, <a href="/search/eess?searchtype=author&query=Tian%2C+J">Jinchuan Tian</a>, <a href="/search/eess?searchtype=author&query=Shi%2C+J">Jiatong Shi</a>, <a href="/search/eess?searchtype=author&query=Chang%2C+X">Xuankai Chang</a>, <a href="/search/eess?searchtype=author&query=Maiti%2C+S">Soumi Maiti</a>, <a href="/search/eess?searchtype=author&query=Livescu%2C+K">Karen Livescu</a>, <a href="/search/eess?searchtype=author&query=Watanabe%2C+S">Shinji Watanabe</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.00837v2-abstract-short" style="display: inline;"> Self-supervised learning (SSL) has helped extend speech technologies to more languages by reducing the need for labeled data. However, models are still far from supporting the world's 7000+ languages. We propose XEUS, a Cross-lingual Encoder for Universal Speech, trained on over 1 million hours of data across 4057 languages, extending the language coverage of SSL models 4-fold. We combine 1 millio… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.00837v2-abstract-full').style.display = 'inline'; document.getElementById('2407.00837v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.00837v2-abstract-full" style="display: none;"> Self-supervised learning (SSL) has helped extend speech technologies to more languages by reducing the need for labeled data. However, models are still far from supporting the world's 7000+ languages. We propose XEUS, a Cross-lingual Encoder for Universal Speech, trained on over 1 million hours of data across 4057 languages, extending the language coverage of SSL models 4-fold. We combine 1 million hours of speech from existing publicly accessible corpora with a newly created corpus of 7400+ hours from 4057 languages, which will be publicly released. To handle the diverse conditions of multilingual speech data, we augment the typical SSL masked prediction approach with a novel dereverberation objective, increasing robustness. We evaluate XEUS on several benchmarks, and show that it consistently outperforms or achieves comparable results to state-of-the-art (SOTA) SSL models across a variety of tasks. XEUS sets a new SOTA on the ML-SUPERB benchmark: it outperforms MMS 1B and w2v-BERT 2.0 v2 by 0.8% and 4.4% respectively, despite having less parameters or pre-training data. Checkpoints, code, and data are found in https://www.wavlab.org/activities/2024/xeus/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.00837v2-abstract-full').style.display = 'none'; document.getElementById('2407.00837v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 30 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Updated affiliations; 20 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.17246">arXiv:2406.17246</a> <span> [<a href="https://arxiv.org/pdf/2406.17246">pdf</a>, <a href="https://arxiv.org/format/2406.17246">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Beyond Silence: Bias Analysis through Loss and Asymmetric Approach in Audio Anti-Spoofing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Shim%2C+H">Hye-jin Shim</a>, <a href="/search/eess?searchtype=author&query=Sahidullah%2C+M">Md Sahidullah</a>, <a href="/search/eess?searchtype=author&query=Jung%2C+J">Jee-weon Jung</a>, <a href="/search/eess?searchtype=author&query=Watanabe%2C+S">Shinji Watanabe</a>, <a href="/search/eess?searchtype=author&query=Kinnunen%2C+T">Tomi Kinnunen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.17246v2-abstract-short" style="display: inline;"> Current trends in audio anti-spoofing detection research strive to improve models' ability to generalize across unseen attacks by learning to identify a variety of spoofing artifacts. This emphasis has primarily focused on the spoof class. Recently, several studies have noted that the distribution of silence differs between the two classes, which can serve as a shortcut. In this paper, we extend c… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.17246v2-abstract-full').style.display = 'inline'; document.getElementById('2406.17246v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.17246v2-abstract-full" style="display: none;"> Current trends in audio anti-spoofing detection research strive to improve models' ability to generalize across unseen attacks by learning to identify a variety of spoofing artifacts. This emphasis has primarily focused on the spoof class. Recently, several studies have noted that the distribution of silence differs between the two classes, which can serve as a shortcut. In this paper, we extend class-wise interpretations beyond silence. We employ loss analysis and asymmetric methodologies to move away from traditional attack-focused and result-oriented evaluations towards a deeper examination of model behaviors. Our investigations highlight the significant differences in training dynamics between the two classes, emphasizing the need for future research to focus on robust modeling of the bonafide class. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.17246v2-abstract-full').style.display = 'none'; document.getElementById('2406.17246v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 24 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages, 1 figure, 5 tables, ISCA Interspeech 2024 SynData4GenAI Workshop</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.16120">arXiv:2406.16120</a> <span> [<a href="https://arxiv.org/pdf/2406.16120">pdf</a>, <a href="https://arxiv.org/format/2406.16120">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.21437/Interspeech.2024-1257">10.21437/Interspeech.2024-1257 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Contextualized End-to-end Automatic Speech Recognition with Intermediate Biasing Loss </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Shakeel%2C+M">Muhammad Shakeel</a>, <a href="/search/eess?searchtype=author&query=Sudo%2C+Y">Yui Sudo</a>, <a href="/search/eess?searchtype=author&query=Peng%2C+Y">Yifan Peng</a>, <a href="/search/eess?searchtype=author&query=Watanabe%2C+S">Shinji Watanabe</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.16120v1-abstract-short" style="display: inline;"> Contextualized end-to-end automatic speech recognition has been an active research area, with recent efforts focusing on the implicit learning of contextual phrases based on the final loss objective. However, these approaches ignore the useful contextual knowledge encoded in the intermediate layers. We hypothesize that employing explicit biasing loss as an auxiliary task in the encoder intermediat… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.16120v1-abstract-full').style.display = 'inline'; document.getElementById('2406.16120v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.16120v1-abstract-full" style="display: none;"> Contextualized end-to-end automatic speech recognition has been an active research area, with recent efforts focusing on the implicit learning of contextual phrases based on the final loss objective. However, these approaches ignore the useful contextual knowledge encoded in the intermediate layers. We hypothesize that employing explicit biasing loss as an auxiliary task in the encoder intermediate layers may better align text tokens or audio frames with the desired objectives. Our proposed intermediate biasing loss brings more regularization and contextualization to the network. Our method outperforms a conventional contextual biasing baseline on the LibriSpeech corpus, achieving a relative improvement of 22.5% in biased word error rate (B-WER) and up to 44% compared to the non-contextual baseline with a biasing list size of 100. Moreover, employing RNN-transducer-driven joint decoding further reduces the unbiased word error rate (U-WER), resulting in a more robust network. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.16120v1-abstract-full').style.display = 'none'; document.getElementById('2406.16120v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to INTERSPEECH 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.16107">arXiv:2406.16107</a> <span> [<a href="https://arxiv.org/pdf/2406.16107">pdf</a>, <a href="https://arxiv.org/ps/2406.16107">ps</a>, <a href="https://arxiv.org/format/2406.16107">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Decoder-only Architecture for Streaming End-to-end Speech Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Tsunoo%2C+E">Emiru Tsunoo</a>, <a href="/search/eess?searchtype=author&query=Futami%2C+H">Hayato Futami</a>, <a href="/search/eess?searchtype=author&query=Kashiwagi%2C+Y">Yosuke Kashiwagi</a>, <a href="/search/eess?searchtype=author&query=Arora%2C+S">Siddhant Arora</a>, <a href="/search/eess?searchtype=author&query=Watanabe%2C+S">Shinji Watanabe</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.16107v2-abstract-short" style="display: inline;"> Decoder-only language models (LMs) have been successfully adopted for speech-processing tasks including automatic speech recognition (ASR). The LMs have ample expressiveness and perform efficiently. This efficiency is a suitable characteristic for streaming applications of ASR. In this work, we propose to use a decoder-only architecture for blockwise streaming ASR. In our approach, speech features… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.16107v2-abstract-full').style.display = 'inline'; document.getElementById('2406.16107v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.16107v2-abstract-full" style="display: none;"> Decoder-only language models (LMs) have been successfully adopted for speech-processing tasks including automatic speech recognition (ASR). The LMs have ample expressiveness and perform efficiently. This efficiency is a suitable characteristic for streaming applications of ASR. In this work, we propose to use a decoder-only architecture for blockwise streaming ASR. In our approach, speech features are compressed using CTC output and context embedding using blockwise speech subnetwork, and are sequentially provided as prompts to the decoder. The decoder estimates the output tokens promptly at each block. To this end, we also propose a novel training scheme using random-length prefix prompts to make the model robust to the truncated prompts caused by blockwise processing. An experimental comparison shows that our proposed decoder-only streaming ASR achieves 8% relative word error rate reduction in the LibriSpeech test-other set while being twice as fast as the baseline model. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.16107v2-abstract-full').style.display = 'none'; document.getElementById('2406.16107v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 23 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted for Interspeech 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.13471">arXiv:2406.13471</a> <span> [<a href="https://arxiv.org/pdf/2406.13471">pdf</a>, <a href="https://arxiv.org/format/2406.13471">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Diffusion-based Generative Modeling with Discriminative Guidance for Streamable Speech Enhancement </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Li%2C+C">Chenda Li</a>, <a href="/search/eess?searchtype=author&query=Cornell%2C+S">Samuele Cornell</a>, <a href="/search/eess?searchtype=author&query=Watanabe%2C+S">Shinji Watanabe</a>, <a href="/search/eess?searchtype=author&query=Qian%2C+Y">Yanmin Qian</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.13471v1-abstract-short" style="display: inline;"> Diffusion-based generative models (DGMs) have recently attracted attention in speech enhancement research (SE) as previous works showed a remarkable generalization capability. However, DGMs are also computationally intensive, as they usually require many iterations in the reverse diffusion process (RDP), making them impractical for streaming SE systems. In this paper, we propose to use discriminat… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.13471v1-abstract-full').style.display = 'inline'; document.getElementById('2406.13471v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.13471v1-abstract-full" style="display: none;"> Diffusion-based generative models (DGMs) have recently attracted attention in speech enhancement research (SE) as previous works showed a remarkable generalization capability. However, DGMs are also computationally intensive, as they usually require many iterations in the reverse diffusion process (RDP), making them impractical for streaming SE systems. In this paper, we propose to use discriminative scores from discriminative models in the first steps of the RDP. These discriminative scores require only one forward pass with the discriminative model for multiple RDP steps, thus greatly reducing computations. This approach also allows for performance improvements. We show that we can trade off between generative and discriminative capabilities as the number of steps with the discriminative score increases. Furthermore, we propose a novel streamable time-domain generative model with an algorithmic latency of 50 ms, which has no significant performance degradation compared to offline models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.13471v1-abstract-full').style.display = 'none'; document.getElementById('2406.13471v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.12611">arXiv:2406.12611</a> <span> [<a href="https://arxiv.org/pdf/2406.12611">pdf</a>, <a href="https://arxiv.org/format/2406.12611">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Rapid Language Adaptation for Multilingual E2E Speech Recognition Using Encoder Prompting </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Kashiwagi%2C+Y">Yosuke Kashiwagi</a>, <a href="/search/eess?searchtype=author&query=Futami%2C+H">Hayato Futami</a>, <a href="/search/eess?searchtype=author&query=Tsunoo%2C+E">Emiru Tsunoo</a>, <a href="/search/eess?searchtype=author&query=Arora%2C+S">Siddhant Arora</a>, <a href="/search/eess?searchtype=author&query=Watanabe%2C+S">Shinji Watanabe</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.12611v1-abstract-short" style="display: inline;"> End-to-end multilingual speech recognition models handle multiple languages through a single model, often incorporating language identification to automatically detect the language of incoming speech. Since the common scenario is where the language is already known, these models can perform as language-specific by using language information as prompts, which is particularly beneficial for attentio… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.12611v1-abstract-full').style.display = 'inline'; document.getElementById('2406.12611v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.12611v1-abstract-full" style="display: none;"> End-to-end multilingual speech recognition models handle multiple languages through a single model, often incorporating language identification to automatically detect the language of incoming speech. Since the common scenario is where the language is already known, these models can perform as language-specific by using language information as prompts, which is particularly beneficial for attention-based encoder-decoder architectures. However, the Connectionist Temporal Classification (CTC) approach, which enhances recognition via joint decoding and multi-task training, does not normally incorporate language prompts due to its conditionally independent output tokens. To overcome this, we introduce an encoder prompting technique within the self-conditioned CTC framework, enabling language-specific adaptation of the CTC model in a zero-shot manner. Our method has shown to significantly reduce errors by 28% on average and by 41% on low-resource languages. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.12611v1-abstract-full').style.display = 'none'; document.getElementById('2406.12611v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by INTERSPEECH 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.12317">arXiv:2406.12317</a> <span> [<a href="https://arxiv.org/pdf/2406.12317">pdf</a>, <a href="https://arxiv.org/format/2406.12317">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Finding Task-specific Subnetworks in Multi-task Spoken Language Understanding Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Futami%2C+H">Hayato Futami</a>, <a href="/search/eess?searchtype=author&query=Arora%2C+S">Siddhant Arora</a>, <a href="/search/eess?searchtype=author&query=Kashiwagi%2C+Y">Yosuke Kashiwagi</a>, <a href="/search/eess?searchtype=author&query=Tsunoo%2C+E">Emiru Tsunoo</a>, <a href="/search/eess?searchtype=author&query=Watanabe%2C+S">Shinji Watanabe</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.12317v1-abstract-short" style="display: inline;"> Recently, multi-task spoken language understanding (SLU) models have emerged, designed to address various speech processing tasks. However, these models often rely on a large number of parameters. Also, they often encounter difficulties in adapting to new data for a specific task without experiencing catastrophic forgetting of previously trained tasks. In this study, we propose finding task-specif… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.12317v1-abstract-full').style.display = 'inline'; document.getElementById('2406.12317v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.12317v1-abstract-full" style="display: none;"> Recently, multi-task spoken language understanding (SLU) models have emerged, designed to address various speech processing tasks. However, these models often rely on a large number of parameters. Also, they often encounter difficulties in adapting to new data for a specific task without experiencing catastrophic forgetting of previously trained tasks. In this study, we propose finding task-specific subnetworks within a multi-task SLU model via neural network pruning. In addition to model compression, we expect that the forgetting of previously trained tasks can be mitigated by updating only a task-specific subnetwork. We conduct experiments on top of the state-of-the-art multi-task SLU model ``UniverSLU'', trained for several tasks such as emotion recognition (ER), intent classification (IC), and automatic speech recognition (ASR). We show that pruned models were successful in adapting to additional ASR or IC data with minimal performance degradation on previously trained tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.12317v1-abstract-full').style.display = 'none'; document.getElementById('2406.12317v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to Interspeech2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.10083">arXiv:2406.10083</a> <span> [<a href="https://arxiv.org/pdf/2406.10083">pdf</a>, <a href="https://arxiv.org/format/2406.10083">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> On the Evaluation of Speech Foundation Models for Spoken Language Understanding </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Arora%2C+S">Siddhant Arora</a>, <a href="/search/eess?searchtype=author&query=Pasad%2C+A">Ankita Pasad</a>, <a href="/search/eess?searchtype=author&query=Chien%2C+C">Chung-Ming Chien</a>, <a href="/search/eess?searchtype=author&query=Han%2C+J">Jionghao Han</a>, <a href="/search/eess?searchtype=author&query=Sharma%2C+R">Roshan Sharma</a>, <a href="/search/eess?searchtype=author&query=Jung%2C+J">Jee-weon Jung</a>, <a href="/search/eess?searchtype=author&query=Dhamyal%2C+H">Hira Dhamyal</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+W">William Chen</a>, <a href="/search/eess?searchtype=author&query=Shon%2C+S">Suwon Shon</a>, <a href="/search/eess?searchtype=author&query=Lee%2C+H">Hung-yi Lee</a>, <a href="/search/eess?searchtype=author&query=Livescu%2C+K">Karen Livescu</a>, <a href="/search/eess?searchtype=author&query=Watanabe%2C+S">Shinji Watanabe</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.10083v1-abstract-short" style="display: inline;"> The Spoken Language Understanding Evaluation (SLUE) suite of benchmark tasks was recently introduced to address the need for open resources and benchmarking of complex spoken language understanding (SLU) tasks, including both classification and sequence generation tasks, on natural speech. The benchmark has demonstrated preliminary success in using pre-trained speech foundation models (SFM) for th… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.10083v1-abstract-full').style.display = 'inline'; document.getElementById('2406.10083v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.10083v1-abstract-full" style="display: none;"> The Spoken Language Understanding Evaluation (SLUE) suite of benchmark tasks was recently introduced to address the need for open resources and benchmarking of complex spoken language understanding (SLU) tasks, including both classification and sequence generation tasks, on natural speech. The benchmark has demonstrated preliminary success in using pre-trained speech foundation models (SFM) for these SLU tasks. However, the community still lacks a fine-grained understanding of the comparative utility of different SFMs. Inspired by this, we ask: which SFMs offer the most benefits for these complex SLU tasks, and what is the most effective approach for incorporating these SFMs? To answer this, we perform an extensive evaluation of multiple supervised and self-supervised SFMs using several evaluation protocols: (i) frozen SFMs with a lightweight prediction head, (ii) frozen SFMs with a complex prediction head, and (iii) fine-tuned SFMs with a lightweight prediction head. Although the supervised SFMs are pre-trained on much more speech recognition data (with labels), they do not always outperform self-supervised SFMs; the latter tend to perform at least as well as, and sometimes better than, supervised SFMs, especially on the sequence generation tasks in SLUE. While there is no universally optimal way of incorporating SFMs, the complex prediction head gives the best performance for most tasks, although it increases the inference time. We also introduce an open-source toolkit and performance leaderboard, SLUE-PERB, for these tasks and modeling strategies. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.10083v1-abstract-full').style.display = 'none'; document.getElementById('2406.10083v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at ACL Findings 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.09869">arXiv:2406.09869</a> <span> [<a href="https://arxiv.org/pdf/2406.09869">pdf</a>, <a href="https://arxiv.org/ps/2406.09869">ps</a>, <a href="https://arxiv.org/format/2406.09869">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> MMM: Multi-Layer Multi-Residual Multi-Stream Discrete Speech Representation from Self-supervised Learning Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Shi%2C+J">Jiatong Shi</a>, <a href="/search/eess?searchtype=author&query=Ma%2C+X">Xutai Ma</a>, <a href="/search/eess?searchtype=author&query=Inaguma%2C+H">Hirofumi Inaguma</a>, <a href="/search/eess?searchtype=author&query=Sun%2C+A">Anna Sun</a>, <a href="/search/eess?searchtype=author&query=Watanabe%2C+S">Shinji Watanabe</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.09869v1-abstract-short" style="display: inline;"> Speech discrete representation has proven effective in various downstream applications due to its superior compression rate of the waveform, fast convergence during training, and compatibility with other modalities. Discrete units extracted from self-supervised learning (SSL) models have emerged as a prominent approach for obtaining speech discrete representation. However, while discrete units hav… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.09869v1-abstract-full').style.display = 'inline'; document.getElementById('2406.09869v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.09869v1-abstract-full" style="display: none;"> Speech discrete representation has proven effective in various downstream applications due to its superior compression rate of the waveform, fast convergence during training, and compatibility with other modalities. Discrete units extracted from self-supervised learning (SSL) models have emerged as a prominent approach for obtaining speech discrete representation. However, while discrete units have shown effectiveness compared to spectral features, they still lag behind continuous SSL representations. In this work, we propose MMM, a multi-layer multi-residual multi-stream discrete units extraction method from SSL. Specifically, we introduce iterative residual vector quantization with K-means for different layers in an SSL model to extract multi-stream speech discrete representation. Through extensive experiments in speech recognition, speech resynthesis, and text-to-speech, we demonstrate the proposed MMM can surpass or on-par with neural codec's performance under various conditions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.09869v1-abstract-full').style.display = 'none'; document.getElementById('2406.09869v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by Interspeech2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.09345">arXiv:2406.09345</a> <span> [<a href="https://arxiv.org/pdf/2406.09345">pdf</a>, <a href="https://arxiv.org/format/2406.09345">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> DiscreteSLU: A Large Language Model with Self-Supervised Discrete Speech Units for Spoken Language Understanding </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Shon%2C+S">Suwon Shon</a>, <a href="/search/eess?searchtype=author&query=Kim%2C+K">Kwangyoun Kim</a>, <a href="/search/eess?searchtype=author&query=Hsu%2C+Y">Yi-Te Hsu</a>, <a href="/search/eess?searchtype=author&query=Sridhar%2C+P">Prashant Sridhar</a>, <a href="/search/eess?searchtype=author&query=Watanabe%2C+S">Shinji Watanabe</a>, <a href="/search/eess?searchtype=author&query=Livescu%2C+K">Karen Livescu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.09345v1-abstract-short" style="display: inline;"> The integration of pre-trained text-based large language models (LLM) with speech input has enabled instruction-following capabilities for diverse speech tasks. This integration requires the use of a speech encoder, a speech adapter, and an LLM, trained on diverse tasks. We propose the use of discrete speech units (DSU), rather than continuous-valued speech encoder outputs, that are converted to t… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.09345v1-abstract-full').style.display = 'inline'; document.getElementById('2406.09345v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.09345v1-abstract-full" style="display: none;"> The integration of pre-trained text-based large language models (LLM) with speech input has enabled instruction-following capabilities for diverse speech tasks. This integration requires the use of a speech encoder, a speech adapter, and an LLM, trained on diverse tasks. We propose the use of discrete speech units (DSU), rather than continuous-valued speech encoder outputs, that are converted to the LLM token embedding space using the speech adapter. We generate DSU using a self-supervised speech encoder followed by k-means clustering. The proposed model shows robust performance on speech inputs from seen/unseen domains and instruction-following capability in spoken question answering. We also explore various types of DSU extracted from different layers of the self-supervised speech encoder, as well as Mel frequency Cepstral Coefficients (MFCC). Our findings suggest that the ASR task and datasets are not crucial in instruction-tuning for spoken question answering tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.09345v1-abstract-full').style.display = 'none'; document.getElementById('2406.09345v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.09282">arXiv:2406.09282</a> <span> [<a href="https://arxiv.org/pdf/2406.09282">pdf</a>, <a href="https://arxiv.org/format/2406.09282">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> On the Effects of Heterogeneous Data Sources on Speech-to-Text Foundation Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Tian%2C+J">Jinchuan Tian</a>, <a href="/search/eess?searchtype=author&query=Peng%2C+Y">Yifan Peng</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+W">William Chen</a>, <a href="/search/eess?searchtype=author&query=Choi%2C+K">Kwanghee Choi</a>, <a href="/search/eess?searchtype=author&query=Livescu%2C+K">Karen Livescu</a>, <a href="/search/eess?searchtype=author&query=Watanabe%2C+S">Shinji Watanabe</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.09282v1-abstract-short" style="display: inline;"> The Open Whisper-style Speech Model (OWSM) series was introduced to achieve full transparency in building advanced speech-to-text (S2T) foundation models. To this end, OWSM models are trained on 25 public speech datasets, which are heterogeneous in multiple ways. In this study, we advance the OWSM series by introducing OWSM v3.2, which improves on prior models by investigating and addressing the i… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.09282v1-abstract-full').style.display = 'inline'; document.getElementById('2406.09282v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.09282v1-abstract-full" style="display: none;"> The Open Whisper-style Speech Model (OWSM) series was introduced to achieve full transparency in building advanced speech-to-text (S2T) foundation models. To this end, OWSM models are trained on 25 public speech datasets, which are heterogeneous in multiple ways. In this study, we advance the OWSM series by introducing OWSM v3.2, which improves on prior models by investigating and addressing the impacts of this data heterogeneity. Our study begins with a detailed analysis of each dataset, from which we derive two key strategies: data filtering with proxy task to enhance data quality, and the incorporation of punctuation and true-casing using an open large language model (LLM). With all other configurations staying the same, OWSM v3.2 improves performance over the OWSM v3.1 baseline while using 15% less training data. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.09282v1-abstract-full').style.display = 'none'; document.getElementById('2406.09282v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.08761">arXiv:2406.08761</a> <span> [<a href="https://arxiv.org/pdf/2406.08761">pdf</a>, <a href="https://arxiv.org/format/2406.08761">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> VISinger2+: End-to-End Singing Voice Synthesis Augmented by Self-Supervised Learning Representation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Yu%2C+Y">Yifeng Yu</a>, <a href="/search/eess?searchtype=author&query=Shi%2C+J">Jiatong Shi</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+Y">Yuning Wu</a>, <a href="/search/eess?searchtype=author&query=Watanabe%2C+S">Shinji Watanabe</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.08761v1-abstract-short" style="display: inline;"> Singing Voice Synthesis (SVS) has witnessed significant advancements with the advent of deep learning techniques. However, a significant challenge in SVS is the scarcity of labeled singing voice data, which limits the effectiveness of supervised learning methods. In response to this challenge, this paper introduces a novel approach to enhance the quality of SVS by leveraging unlabeled data from pr… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.08761v1-abstract-full').style.display = 'inline'; document.getElementById('2406.08761v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.08761v1-abstract-full" style="display: none;"> Singing Voice Synthesis (SVS) has witnessed significant advancements with the advent of deep learning techniques. However, a significant challenge in SVS is the scarcity of labeled singing voice data, which limits the effectiveness of supervised learning methods. In response to this challenge, this paper introduces a novel approach to enhance the quality of SVS by leveraging unlabeled data from pre-trained self-supervised learning models. Building upon the existing VISinger2 framework, this study integrates additional spectral feature information into the system to enhance its performance. The integration aims to harness the rich acoustic features from the pre-trained models, thereby enriching the synthesis and yielding a more natural and expressive singing voice. Experimental results in various corpora demonstrate the efficacy of this approach in improving the overall quality of synthesized singing voices in both objective and subjective metrics. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.08761v1-abstract-full').style.display = 'none'; document.getElementById('2406.08761v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">4 pages, 2 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.08641">arXiv:2406.08641</a> <span> [<a href="https://arxiv.org/pdf/2406.08641">pdf</a>, <a href="https://arxiv.org/ps/2406.08641">ps</a>, <a href="https://arxiv.org/format/2406.08641">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> ML-SUPERB 2.0: Benchmarking Multilingual Speech Models Across Modeling Constraints, Languages, and Datasets </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Shi%2C+J">Jiatong Shi</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+S">Shih-Heng Wang</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+W">William Chen</a>, <a href="/search/eess?searchtype=author&query=Bartelds%2C+M">Martijn Bartelds</a>, <a href="/search/eess?searchtype=author&query=Kumar%2C+V+B">Vanya Bannihatti Kumar</a>, <a href="/search/eess?searchtype=author&query=Tian%2C+J">Jinchuan Tian</a>, <a href="/search/eess?searchtype=author&query=Chang%2C+X">Xuankai Chang</a>, <a href="/search/eess?searchtype=author&query=Jurafsky%2C+D">Dan Jurafsky</a>, <a href="/search/eess?searchtype=author&query=Livescu%2C+K">Karen Livescu</a>, <a href="/search/eess?searchtype=author&query=Lee%2C+H">Hung-yi Lee</a>, <a href="/search/eess?searchtype=author&query=Watanabe%2C+S">Shinji Watanabe</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.08641v1-abstract-short" style="display: inline;"> ML-SUPERB evaluates self-supervised learning (SSL) models on the tasks of language identification and automatic speech recognition (ASR). This benchmark treats the models as feature extractors and uses a single shallow downstream model, which can be fine-tuned for a downstream task. However, real-world use cases may require different configurations. This paper presents ML-SUPERB~2.0, which is a ne… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.08641v1-abstract-full').style.display = 'inline'; document.getElementById('2406.08641v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.08641v1-abstract-full" style="display: none;"> ML-SUPERB evaluates self-supervised learning (SSL) models on the tasks of language identification and automatic speech recognition (ASR). This benchmark treats the models as feature extractors and uses a single shallow downstream model, which can be fine-tuned for a downstream task. However, real-world use cases may require different configurations. This paper presents ML-SUPERB~2.0, which is a new benchmark for evaluating pre-trained SSL and supervised speech models across downstream models, fine-tuning setups, and efficient model adaptation approaches. We find performance improvements over the setup of ML-SUPERB. However, performance depends on the downstream model design. Also, we find large performance differences between languages and datasets, suggesting the need for more targeted approaches to improve multilingual ASR performance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.08641v1-abstract-full').style.display = 'none'; document.getElementById('2406.08641v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by Interspeech 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.08619">arXiv:2406.08619</a> <span> [<a href="https://arxiv.org/pdf/2406.08619">pdf</a>, <a href="https://arxiv.org/format/2406.08619">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Self-Supervised Speech Representations are More Phonetic than Semantic </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Choi%2C+K">Kwanghee Choi</a>, <a href="/search/eess?searchtype=author&query=Pasad%2C+A">Ankita Pasad</a>, <a href="/search/eess?searchtype=author&query=Nakamura%2C+T">Tomohiko Nakamura</a>, <a href="/search/eess?searchtype=author&query=Fukayama%2C+S">Satoru Fukayama</a>, <a href="/search/eess?searchtype=author&query=Livescu%2C+K">Karen Livescu</a>, <a href="/search/eess?searchtype=author&query=Watanabe%2C+S">Shinji Watanabe</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.08619v1-abstract-short" style="display: inline;"> Self-supervised speech models (S3Ms) have become an effective backbone for speech applications. Various analyses suggest that S3Ms encode linguistic properties. In this work, we seek a more fine-grained analysis of the word-level linguistic properties encoded in S3Ms. Specifically, we curate a novel dataset of near homophone (phonetically similar) and synonym (semantically similar) word pairs and… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.08619v1-abstract-full').style.display = 'inline'; document.getElementById('2406.08619v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.08619v1-abstract-full" style="display: none;"> Self-supervised speech models (S3Ms) have become an effective backbone for speech applications. Various analyses suggest that S3Ms encode linguistic properties. In this work, we seek a more fine-grained analysis of the word-level linguistic properties encoded in S3Ms. Specifically, we curate a novel dataset of near homophone (phonetically similar) and synonym (semantically similar) word pairs and measure the similarities between S3M word representation pairs. Our study reveals that S3M representations consistently and significantly exhibit more phonetic than semantic similarity. Further, we question whether widely used intent classification datasets such as Fluent Speech Commands and Snips Smartlights are adequate for measuring semantic abilities. Our simple baseline, using only the word identity, surpasses S3M-based models. This corroborates our findings and suggests that high scores on these datasets do not necessarily guarantee the presence of semantic content. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.08619v1-abstract-full').style.display = 'none'; document.getElementById('2406.08619v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to Interspeech 2024. Source code at https://github.com/juice500ml/phonetic_semantic_probing</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.08396">arXiv:2406.08396</a> <span> [<a href="https://arxiv.org/pdf/2406.08396">pdf</a>, <a href="https://arxiv.org/format/2406.08396">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Neural Blind Source Separation and Diarization for Distant Speech Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Bando%2C+Y">Yoshiaki Bando</a>, <a href="/search/eess?searchtype=author&query=Nakamura%2C+T">Tomohiko Nakamura</a>, <a href="/search/eess?searchtype=author&query=Watanabe%2C+S">Shinji Watanabe</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.08396v1-abstract-short" style="display: inline;"> This paper presents a neural method for distant speech recognition (DSR) that jointly separates and diarizes speech mixtures without supervision by isolated signals. A standard separation method for multi-talker DSR is a statistical multichannel method called guided source separation (GSS). While GSS does not require signal-level supervision, it relies on speaker diarization results to handle unkn… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.08396v1-abstract-full').style.display = 'inline'; document.getElementById('2406.08396v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.08396v1-abstract-full" style="display: none;"> This paper presents a neural method for distant speech recognition (DSR) that jointly separates and diarizes speech mixtures without supervision by isolated signals. A standard separation method for multi-talker DSR is a statistical multichannel method called guided source separation (GSS). While GSS does not require signal-level supervision, it relies on speaker diarization results to handle unknown numbers of active speakers. To overcome this limitation, we introduce and train a neural inference model in a weakly-supervised manner, employing the objective function of a statistical separation method. This training requires only multichannel mixtures and their temporal annotations of speaker activities. In contrast to GSS, the trained model can jointly separate and diarize speech mixtures without any auxiliary information. The experiments with the AMI corpus show that our method outperforms GSS with oracle diarization results regarding word error rates. The code is available online. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.08396v1-abstract-full').style.display = 'none'; document.getElementById('2406.08396v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages, 3 figures, accepted to INTERSPEECH 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.07725">arXiv:2406.07725</a> <span> [<a href="https://arxiv.org/pdf/2406.07725">pdf</a>, <a href="https://arxiv.org/ps/2406.07725">ps</a>, <a href="https://arxiv.org/format/2406.07725">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> The Interspeech 2024 Challenge on Speech Processing Using Discrete Units </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Chang%2C+X">Xuankai Chang</a>, <a href="/search/eess?searchtype=author&query=Shi%2C+J">Jiatong Shi</a>, <a href="/search/eess?searchtype=author&query=Tian%2C+J">Jinchuan Tian</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+Y">Yuning Wu</a>, <a href="/search/eess?searchtype=author&query=Tang%2C+Y">Yuxun Tang</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+Y">Yihan Wu</a>, <a href="/search/eess?searchtype=author&query=Watanabe%2C+S">Shinji Watanabe</a>, <a href="/search/eess?searchtype=author&query=Adi%2C+Y">Yossi Adi</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+X">Xie Chen</a>, <a href="/search/eess?searchtype=author&query=Jin%2C+Q">Qin Jin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.07725v1-abstract-short" style="display: inline;"> Representing speech and audio signals in discrete units has become a compelling alternative to traditional high-dimensional feature vectors. Numerous studies have highlighted the efficacy of discrete units in various applications such as speech compression and restoration, speech recognition, and speech generation. To foster exploration in this domain, we introduce the Interspeech 2024 Challenge,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.07725v1-abstract-full').style.display = 'inline'; document.getElementById('2406.07725v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.07725v1-abstract-full" style="display: none;"> Representing speech and audio signals in discrete units has become a compelling alternative to traditional high-dimensional feature vectors. Numerous studies have highlighted the efficacy of discrete units in various applications such as speech compression and restoration, speech recognition, and speech generation. To foster exploration in this domain, we introduce the Interspeech 2024 Challenge, which focuses on new speech processing benchmarks using discrete units. It encompasses three pivotal tasks, namely multilingual automatic speech recognition, text-to-speech, and singing voice synthesis, and aims to assess the potential applicability of discrete units in these tasks. This paper outlines the challenge designs and baseline descriptions. We also collate baseline and selected submission systems, along with preliminary findings, offering valuable contributions to future research in this evolving field. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.07725v1-abstract-full').style.display = 'none'; document.getElementById('2406.07725v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This manuscript has been accepted by Interspeech2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.06185">arXiv:2406.06185</a> <span> [<a href="https://arxiv.org/pdf/2406.06185">pdf</a>, <a href="https://arxiv.org/format/2406.06185">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> EARS: An Anechoic Fullband Speech Dataset Benchmarked for Speech Enhancement and Dereverberation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Richter%2C+J">Julius Richter</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+Y">Yi-Chiao Wu</a>, <a href="/search/eess?searchtype=author&query=Krenn%2C+S">Steven Krenn</a>, <a href="/search/eess?searchtype=author&query=Welker%2C+S">Simon Welker</a>, <a href="/search/eess?searchtype=author&query=Lay%2C+B">Bunlong Lay</a>, <a href="/search/eess?searchtype=author&query=Watanabe%2C+S">Shinji Watanabe</a>, <a href="/search/eess?searchtype=author&query=Richard%2C+A">Alexander Richard</a>, <a href="/search/eess?searchtype=author&query=Gerkmann%2C+T">Timo Gerkmann</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.06185v2-abstract-short" style="display: inline;"> We release the EARS (Expressive Anechoic Recordings of Speech) dataset, a high-quality speech dataset comprising 107 speakers from diverse backgrounds, totaling in 100 hours of clean, anechoic speech data. The dataset covers a large range of different speaking styles, including emotional speech, different reading styles, non-verbal sounds, and conversational freeform speech. We benchmark various m… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.06185v2-abstract-full').style.display = 'inline'; document.getElementById('2406.06185v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.06185v2-abstract-full" style="display: none;"> We release the EARS (Expressive Anechoic Recordings of Speech) dataset, a high-quality speech dataset comprising 107 speakers from diverse backgrounds, totaling in 100 hours of clean, anechoic speech data. The dataset covers a large range of different speaking styles, including emotional speech, different reading styles, non-verbal sounds, and conversational freeform speech. We benchmark various methods for speech enhancement and dereverberation on the dataset and evaluate their performance through a set of instrumental metrics. In addition, we conduct a listening test with 20 participants for the speech enhancement task, where a generative method is preferred. We introduce a blind test set that allows for automatic online evaluation of uploaded data. Dataset download links and automatic evaluation server can be found online. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.06185v2-abstract-full').style.display = 'none'; document.getElementById('2406.06185v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 10 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at Interspeech 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.05339">arXiv:2406.05339</a> <span> [<a href="https://arxiv.org/pdf/2406.05339">pdf</a>, <a href="https://arxiv.org/format/2406.05339">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> To what extent can ASV systems naturally defend against spoofing attacks? </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Jung%2C+J">Jee-weon Jung</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+X">Xin Wang</a>, <a href="/search/eess?searchtype=author&query=Evans%2C+N">Nicholas Evans</a>, <a href="/search/eess?searchtype=author&query=Watanabe%2C+S">Shinji Watanabe</a>, <a href="/search/eess?searchtype=author&query=Shim%2C+H">Hye-jin Shim</a>, <a href="/search/eess?searchtype=author&query=Tak%2C+H">Hemlata Tak</a>, <a href="/search/eess?searchtype=author&query=Arora%2C+S">Sidhhant Arora</a>, <a href="/search/eess?searchtype=author&query=Yamagishi%2C+J">Junichi Yamagishi</a>, <a href="/search/eess?searchtype=author&query=Chung%2C+J+S">Joon Son Chung</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.05339v3-abstract-short" style="display: inline;"> The current automatic speaker verification (ASV) task involves making binary decisions on two types of trials: target and non-target. However, emerging advancements in speech generation technology pose significant threats to the reliability of ASV systems. This study investigates whether ASV effortlessly acquires robustness against spoofing attacks (i.e., zero-shot capability) by systematically ex… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.05339v3-abstract-full').style.display = 'inline'; document.getElementById('2406.05339v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.05339v3-abstract-full" style="display: none;"> The current automatic speaker verification (ASV) task involves making binary decisions on two types of trials: target and non-target. However, emerging advancements in speech generation technology pose significant threats to the reliability of ASV systems. This study investigates whether ASV effortlessly acquires robustness against spoofing attacks (i.e., zero-shot capability) by systematically exploring diverse ASV systems and spoofing attacks, ranging from traditional to cutting-edge techniques. Through extensive analyses conducted on eight distinct ASV systems and 29 spoofing attack systems, we demonstrate that the evolution of ASV inherently incorporates defense mechanisms against spoofing attacks. Nevertheless, our findings also underscore that the advancement of spoofing attacks far outpaces that of ASV systems, hence necessitating further research on spoofing-robust ASV methodologies. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.05339v3-abstract-full').style.display = 'none'; document.getElementById('2406.05339v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 7 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages, 3 figures, 3 tables, Interspeech 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.04660">arXiv:2406.04660</a> <span> [<a href="https://arxiv.org/pdf/2406.04660">pdf</a>, <a href="https://arxiv.org/format/2406.04660">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.21437/Interspeech.2024-1239">10.21437/Interspeech.2024-1239 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> URGENT Challenge: Universality, Robustness, and Generalizability For Speech Enhancement </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhang%2C+W">Wangyou Zhang</a>, <a href="/search/eess?searchtype=author&query=Scheibler%2C+R">Robin Scheibler</a>, <a href="/search/eess?searchtype=author&query=Saijo%2C+K">Kohei Saijo</a>, <a href="/search/eess?searchtype=author&query=Cornell%2C+S">Samuele Cornell</a>, <a href="/search/eess?searchtype=author&query=Li%2C+C">Chenda Li</a>, <a href="/search/eess?searchtype=author&query=Ni%2C+Z">Zhaoheng Ni</a>, <a href="/search/eess?searchtype=author&query=Kumar%2C+A">Anurag Kumar</a>, <a href="/search/eess?searchtype=author&query=Pirklbauer%2C+J">Jan Pirklbauer</a>, <a href="/search/eess?searchtype=author&query=Sach%2C+M">Marvin Sach</a>, <a href="/search/eess?searchtype=author&query=Watanabe%2C+S">Shinji Watanabe</a>, <a href="/search/eess?searchtype=author&query=Fingscheidt%2C+T">Tim Fingscheidt</a>, <a href="/search/eess?searchtype=author&query=Qian%2C+Y">Yanmin Qian</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.04660v1-abstract-short" style="display: inline;"> The last decade has witnessed significant advancements in deep learning-based speech enhancement (SE). However, most existing SE research has limitations on the coverage of SE sub-tasks, data diversity and amount, and evaluation metrics. To fill this gap and promote research toward universal SE, we establish a new SE challenge, named URGENT, to focus on the universality, robustness, and generaliza… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.04660v1-abstract-full').style.display = 'inline'; document.getElementById('2406.04660v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.04660v1-abstract-full" style="display: none;"> The last decade has witnessed significant advancements in deep learning-based speech enhancement (SE). However, most existing SE research has limitations on the coverage of SE sub-tasks, data diversity and amount, and evaluation metrics. To fill this gap and promote research toward universal SE, we establish a new SE challenge, named URGENT, to focus on the universality, robustness, and generalizability of SE. We aim to extend the SE definition to cover different sub-tasks to explore the limits of SE models, starting from denoising, dereverberation, bandwidth extension, and declipping. A novel framework is proposed to unify all these sub-tasks in a single model, allowing the use of all existing SE approaches. We collected public speech and noise data from different domains to construct diverse evaluation data. Finally, we discuss the insights gained from our preliminary baseline experiments based on both generative and discriminative SE methods with 12 curated metrics. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.04660v1-abstract-full').style.display = 'none'; document.getElementById('2406.04660v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">6 pages, 3 figures, 3 tables. Accepted by Interspeech 2024. An extended version of the accepted manuscript with appendix</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.04269">arXiv:2406.04269</a> <span> [<a href="https://arxiv.org/pdf/2406.04269">pdf</a>, <a href="https://arxiv.org/format/2406.04269">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.21437/Interspeech.2024-1266">10.21437/Interspeech.2024-1266 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Beyond Performance Plateaus: A Comprehensive Study on Scalability in Speech Enhancement </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhang%2C+W">Wangyou Zhang</a>, <a href="/search/eess?searchtype=author&query=Saijo%2C+K">Kohei Saijo</a>, <a href="/search/eess?searchtype=author&query=Jung%2C+J">Jee-weon Jung</a>, <a href="/search/eess?searchtype=author&query=Li%2C+C">Chenda Li</a>, <a href="/search/eess?searchtype=author&query=Watanabe%2C+S">Shinji Watanabe</a>, <a href="/search/eess?searchtype=author&query=Qian%2C+Y">Yanmin Qian</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.04269v1-abstract-short" style="display: inline;"> Deep learning-based speech enhancement (SE) models have achieved impressive performance in the past decade. Numerous advanced architectures have been designed to deliver state-of-the-art performance; however, their scalability potential remains unrevealed. Meanwhile, the majority of research focuses on small-sized datasets with restricted diversity, leading to a plateau in performance improvement.… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.04269v1-abstract-full').style.display = 'inline'; document.getElementById('2406.04269v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.04269v1-abstract-full" style="display: none;"> Deep learning-based speech enhancement (SE) models have achieved impressive performance in the past decade. Numerous advanced architectures have been designed to deliver state-of-the-art performance; however, their scalability potential remains unrevealed. Meanwhile, the majority of research focuses on small-sized datasets with restricted diversity, leading to a plateau in performance improvement. In this paper, we aim to provide new insights for addressing the above issues by exploring the scalability of SE models in terms of architectures, model sizes, compute budgets, and dataset sizes. Our investigation involves several popular SE architectures and speech data from different domains. Experiments reveal both similarities and distinctions between the scaling effects in SE and other tasks such as speech recognition. These findings further provide insights into the under-explored SE directions, e.g., larger-scale multi-domain corpora and efficiently scalable architectures. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.04269v1-abstract-full').style.display = 'none'; document.getElementById('2406.04269v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages, 3 figures, 4 tables, Accepted by Interspeech 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.02950">arXiv:2406.02950</a> <span> [<a href="https://arxiv.org/pdf/2406.02950">pdf</a>, <a href="https://arxiv.org/format/2406.02950">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> 4D ASR: Joint Beam Search Integrating CTC, Attention, Transducer, and Mask Predict Decoders </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Sudo%2C+Y">Yui Sudo</a>, <a href="/search/eess?searchtype=author&query=Shakeel%2C+M">Muhammad Shakeel</a>, <a href="/search/eess?searchtype=author&query=Fukumoto%2C+Y">Yosuke Fukumoto</a>, <a href="/search/eess?searchtype=author&query=Yan%2C+B">Brian Yan</a>, <a href="/search/eess?searchtype=author&query=Shi%2C+J">Jiatong Shi</a>, <a href="/search/eess?searchtype=author&query=Peng%2C+Y">Yifan Peng</a>, <a href="/search/eess?searchtype=author&query=Watanabe%2C+S">Shinji Watanabe</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.02950v1-abstract-short" style="display: inline;"> End-to-end automatic speech recognition (E2E-ASR) can be classified into several network architectures, such as connectionist temporal classification (CTC), recurrent neural network transducer (RNN-T), attention-based encoder-decoder, and mask-predict models. Each network architecture has advantages and disadvantages, leading practitioners to switch between these different models depending on appl… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.02950v1-abstract-full').style.display = 'inline'; document.getElementById('2406.02950v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.02950v1-abstract-full" style="display: none;"> End-to-end automatic speech recognition (E2E-ASR) can be classified into several network architectures, such as connectionist temporal classification (CTC), recurrent neural network transducer (RNN-T), attention-based encoder-decoder, and mask-predict models. Each network architecture has advantages and disadvantages, leading practitioners to switch between these different models depending on application requirements. Instead of building separate models, we propose a joint modeling scheme where four decoders (CTC, RNN-T, attention, and mask-predict) share the same encoder -- we refer to this as 4D modeling. The 4D model is trained using multitask learning, which will bring model regularization and maximize the model robustness thanks to their complementary properties. To efficiently train the 4D model, we introduce a two-stage training strategy that stabilizes multitask learning. In addition, we propose three novel one-pass beam search algorithms by combining three decoders (CTC, RNN-T, and attention) to further improve performance. These three beam search algorithms differ in which decoder is used as the primary decoder. We carefully evaluate the performance and computational tradeoffs associated with each algorithm. Experimental results demonstrate that the jointly trained 4D model outperforms the E2E-ASR models trained with only one individual decoder. Furthermore, we demonstrate that the proposed one-pass beam search algorithm outperforms the previously proposed CTC/attention decoding. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.02950v1-abstract-full').style.display = 'none'; document.getElementById('2406.02950v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">submitted to IEEE/ACM Transactions on Audio Speech and Language Processing</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.02560">arXiv:2406.02560</a> <span> [<a href="https://arxiv.org/pdf/2406.02560">pdf</a>, <a href="https://arxiv.org/format/2406.02560">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Less Peaky and More Accurate CTC Forced Alignment by Label Priors </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Huang%2C+R">Ruizhe Huang</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+X">Xiaohui Zhang</a>, <a href="/search/eess?searchtype=author&query=Ni%2C+Z">Zhaoheng Ni</a>, <a href="/search/eess?searchtype=author&query=Sun%2C+L">Li Sun</a>, <a href="/search/eess?searchtype=author&query=Hira%2C+M">Moto Hira</a>, <a href="/search/eess?searchtype=author&query=Hwang%2C+J">Jeff Hwang</a>, <a href="/search/eess?searchtype=author&query=Manohar%2C+V">Vimal Manohar</a>, <a href="/search/eess?searchtype=author&query=Pratap%2C+V">Vineel Pratap</a>, <a href="/search/eess?searchtype=author&query=Wiesner%2C+M">Matthew Wiesner</a>, <a href="/search/eess?searchtype=author&query=Watanabe%2C+S">Shinji Watanabe</a>, <a href="/search/eess?searchtype=author&query=Povey%2C+D">Daniel Povey</a>, <a href="/search/eess?searchtype=author&query=Khudanpur%2C+S">Sanjeev Khudanpur</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.02560v3-abstract-short" style="display: inline;"> Connectionist temporal classification (CTC) models are known to have peaky output distributions. Such behavior is not a problem for automatic speech recognition (ASR), but it can cause inaccurate forced alignments (FA), especially at finer granularity, e.g., phoneme level. This paper aims at alleviating the peaky behavior for CTC and improve its suitability for forced alignment generation, by leve… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.02560v3-abstract-full').style.display = 'inline'; document.getElementById('2406.02560v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.02560v3-abstract-full" style="display: none;"> Connectionist temporal classification (CTC) models are known to have peaky output distributions. Such behavior is not a problem for automatic speech recognition (ASR), but it can cause inaccurate forced alignments (FA), especially at finer granularity, e.g., phoneme level. This paper aims at alleviating the peaky behavior for CTC and improve its suitability for forced alignment generation, by leveraging label priors, so that the scores of alignment paths containing fewer blanks are boosted and maximized during training. As a result, our CTC model produces less peaky posteriors and is able to more accurately predict the offset of the tokens besides their onset. It outperforms the standard CTC model and a heuristics-based approach for obtaining CTC's token offset timestamps by 12-40% in phoneme and word boundary errors (PBE and WBE) measured on the Buckeye and TIMIT data. Compared with the most widely used FA toolkit Montreal Forced Aligner (MFA), our method performs similarly on PBE/WBE on Buckeye, yet falls behind MFA on TIMIT. Nevertheless, our method has a much simpler training pipeline and better runtime efficiency. Our training recipe and pretrained model are released in TorchAudio. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.02560v3-abstract-full').style.display = 'none'; document.getElementById('2406.02560v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 22 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ICASSP 2024. Github repo: https://github.com/huangruizhe/audio/tree/aligner_label_priors</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.00899">arXiv:2406.00899</a> <span> [<a href="https://arxiv.org/pdf/2406.00899">pdf</a>, <a href="https://arxiv.org/format/2406.00899">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> YODAS: Youtube-Oriented Dataset for Audio and Speech </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Li%2C+X">Xinjian Li</a>, <a href="/search/eess?searchtype=author&query=Takamichi%2C+S">Shinnosuke Takamichi</a>, <a href="/search/eess?searchtype=author&query=Saeki%2C+T">Takaaki Saeki</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+W">William Chen</a>, <a href="/search/eess?searchtype=author&query=Shiota%2C+S">Sayaka Shiota</a>, <a href="/search/eess?searchtype=author&query=Watanabe%2C+S">Shinji Watanabe</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.00899v1-abstract-short" style="display: inline;"> In this study, we introduce YODAS (YouTube-Oriented Dataset for Audio and Speech), a large-scale, multilingual dataset comprising currently over 500k hours of speech data in more than 100 languages, sourced from both labeled and unlabeled YouTube speech datasets. The labeled subsets, including manual or automatic subtitles, facilitate supervised model training. Conversely, the unlabeled subsets ar… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.00899v1-abstract-full').style.display = 'inline'; document.getElementById('2406.00899v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.00899v1-abstract-full" style="display: none;"> In this study, we introduce YODAS (YouTube-Oriented Dataset for Audio and Speech), a large-scale, multilingual dataset comprising currently over 500k hours of speech data in more than 100 languages, sourced from both labeled and unlabeled YouTube speech datasets. The labeled subsets, including manual or automatic subtitles, facilitate supervised model training. Conversely, the unlabeled subsets are apt for self-supervised learning applications. YODAS is distinctive as the first publicly available dataset of its scale, and it is distributed under a Creative Commons license. We introduce the collection methodology utilized for YODAS, which contributes to the large-scale speech dataset construction. Subsequently, we provide a comprehensive analysis of speech, text contained within the dataset. Finally, we describe the speech recognition baselines over the top-15 languages. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.00899v1-abstract-full').style.display = 'none'; document.getElementById('2406.00899v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ASRU 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.20402">arXiv:2405.20402</a> <span> [<a href="https://arxiv.org/pdf/2405.20402">pdf</a>, <a href="https://arxiv.org/format/2405.20402">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Cross-Talk Reduction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Wang%2C+Z">Zhong-Qiu Wang</a>, <a href="/search/eess?searchtype=author&query=Kumar%2C+A">Anurag Kumar</a>, <a href="/search/eess?searchtype=author&query=Watanabe%2C+S">Shinji Watanabe</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.20402v1-abstract-short" style="display: inline;"> While far-field multi-talker mixtures are recorded, each speaker can wear a close-talk microphone so that close-talk mixtures can be recorded at the same time. Although each close-talk mixture has a high signal-to-noise ratio (SNR) of the wearer, it has a very limited range of applications, as it also contains significant cross-talk speech by other speakers and is not clean enough. In this context… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.20402v1-abstract-full').style.display = 'inline'; document.getElementById('2405.20402v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.20402v1-abstract-full" style="display: none;"> While far-field multi-talker mixtures are recorded, each speaker can wear a close-talk microphone so that close-talk mixtures can be recorded at the same time. Although each close-talk mixture has a high signal-to-noise ratio (SNR) of the wearer, it has a very limited range of applications, as it also contains significant cross-talk speech by other speakers and is not clean enough. In this context, we propose a novel task named cross-talk reduction (CTR) which aims at reducing cross-talk speech, and a novel solution named CTRnet which is based on unsupervised or weakly-supervised neural speech separation. In unsupervised CTRnet, close-talk and far-field mixtures are stacked as input for a DNN to estimate the close-talk speech of each speaker. It is trained in an unsupervised, discriminative way such that the DNN estimate for each speaker can be linearly filtered to cancel out the speaker's cross-talk speech captured at other microphones. In weakly-supervised CTRnet, we assume the availability of each speaker's activity timestamps during training, and leverage them to improve the training of unsupervised CTRnet. Evaluation results on a simulated two-speaker CTR task and on a real-recorded conversational speech separation and recognition task show the effectiveness and potential of CTRnet. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.20402v1-abstract-full').style.display = 'none'; document.getElementById('2405.20402v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">in International Joint Conference on Artificial Intelligence (IJCAI), 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.13514">arXiv:2405.13514</a> <span> [<a href="https://arxiv.org/pdf/2405.13514">pdf</a>, <a href="https://arxiv.org/format/2405.13514">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/ICASSPW62465.2024.10627382">10.1109/ICASSPW62465.2024.10627382 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Joint Optimization of Streaming and Non-Streaming Automatic Speech Recognition with Multi-Decoder and Knowledge Distillation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Shakeel%2C+M">Muhammad Shakeel</a>, <a href="/search/eess?searchtype=author&query=Sudo%2C+Y">Yui Sudo</a>, <a href="/search/eess?searchtype=author&query=Peng%2C+Y">Yifan Peng</a>, <a href="/search/eess?searchtype=author&query=Watanabe%2C+S">Shinji Watanabe</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.13514v1-abstract-short" style="display: inline;"> End-to-end (E2E) automatic speech recognition (ASR) can operate in two modes: streaming and non-streaming, each with its pros and cons. Streaming ASR processes the speech frames in real-time as it is being received, while non-streaming ASR waits for the entire speech utterance; thus, professionals may have to operate in either mode to satisfy their application. In this work, we present joint optim… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.13514v1-abstract-full').style.display = 'inline'; document.getElementById('2405.13514v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.13514v1-abstract-full" style="display: none;"> End-to-end (E2E) automatic speech recognition (ASR) can operate in two modes: streaming and non-streaming, each with its pros and cons. Streaming ASR processes the speech frames in real-time as it is being received, while non-streaming ASR waits for the entire speech utterance; thus, professionals may have to operate in either mode to satisfy their application. In this work, we present joint optimization of streaming and non-streaming ASR based on multi-decoder and knowledge distillation. Primarily, we study 1) the encoder integration of these ASR modules, followed by 2) separate decoders to make the switching mode flexible, and enhancing performance by 3) incorporating similarity-preserving knowledge distillation between the two modular encoders and decoders. Evaluation results show 2.6%-5.3% relative character error rate reductions (CERR) on CSJ for streaming ASR, and 8.3%-9.7% relative CERRs for non-streaming ASR within a single model compared to multiple standalone modules. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.13514v1-abstract-full').style.display = 'none'; document.getElementById('2405.13514v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to IEEE ICASSP 2024 workshop Hands-free Speech Communication and Microphone Arrays (HSCMA 2024)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.13344">arXiv:2405.13344</a> <span> [<a href="https://arxiv.org/pdf/2405.13344">pdf</a>, <a href="https://arxiv.org/format/2405.13344">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Contextualized Automatic Speech Recognition with Dynamic Vocabulary </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Sudo%2C+Y">Yui Sudo</a>, <a href="/search/eess?searchtype=author&query=Fukumoto%2C+Y">Yosuke Fukumoto</a>, <a href="/search/eess?searchtype=author&query=Shakeel%2C+M">Muhammad Shakeel</a>, <a href="/search/eess?searchtype=author&query=Peng%2C+Y">Yifan Peng</a>, <a href="/search/eess?searchtype=author&query=Watanabe%2C+S">Shinji Watanabe</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.13344v2-abstract-short" style="display: inline;"> Deep biasing (DB) enhances the performance of end-to-end automatic speech recognition (E2E-ASR) models for rare words or contextual phrases using a bias list. However, most existing methods treat bias phrases as sequences of subwords in a predefined static vocabulary. This naive sequence decomposition produces unnatural token patterns, significantly lowering their occurrence probability. More adva… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.13344v2-abstract-full').style.display = 'inline'; document.getElementById('2405.13344v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.13344v2-abstract-full" style="display: none;"> Deep biasing (DB) enhances the performance of end-to-end automatic speech recognition (E2E-ASR) models for rare words or contextual phrases using a bias list. However, most existing methods treat bias phrases as sequences of subwords in a predefined static vocabulary. This naive sequence decomposition produces unnatural token patterns, significantly lowering their occurrence probability. More advanced techniques address this problem by expanding the vocabulary with additional modules, including the external language model shallow fusion or rescoring. However, they result in increasing the workload due to the additional modules. This paper proposes a dynamic vocabulary where bias tokens can be added during inference. Each entry in a bias list is represented as a single token, unlike a sequence of existing subword tokens. This approach eliminates the need to learn subword dependencies within the bias phrases. This method is easily applied to various architectures because it only expands the embedding and output layers in common E2E-ASR architectures. Experimental results demonstrate that the proposed method improves the bias phrase WER on English and Japanese datasets by 3.1 -- 4.9 points compared with the conventional DB method. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.13344v2-abstract-full').style.display = 'none'; document.getElementById('2405.13344v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 22 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.11078">arXiv:2405.11078</a> <span> [<a href="https://arxiv.org/pdf/2405.11078">pdf</a>, <a href="https://arxiv.org/ps/2405.11078">ps</a>, <a href="https://arxiv.org/format/2405.11078">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/0.1109/ICASSP.2019.8682556">0.1109/ICASSP.2019.8682556 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Acoustic modeling for Overlapping Speech Recognition: JHU Chime-5 Challenge System </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Manohar%2C+V">Vimal Manohar</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+S">Szu-Jui Chen</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+Z">Zhiqi Wang</a>, <a href="/search/eess?searchtype=author&query=Fujita%2C+Y">Yusuke Fujita</a>, <a href="/search/eess?searchtype=author&query=Watanabe%2C+S">Shinji Watanabe</a>, <a href="/search/eess?searchtype=author&query=Khudanpur%2C+S">Sanjeev Khudanpur</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.11078v1-abstract-short" style="display: inline;"> This paper summarizes our acoustic modeling efforts in the Johns Hopkins University speech recognition system for the CHiME-5 challenge to recognize highly-overlapped dinner party speech recorded by multiple microphone arrays. We explore data augmentation approaches, neural network architectures, front-end speech dereverberation, beamforming and robust i-vector extraction with comparisons of our i… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.11078v1-abstract-full').style.display = 'inline'; document.getElementById('2405.11078v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.11078v1-abstract-full" style="display: none;"> This paper summarizes our acoustic modeling efforts in the Johns Hopkins University speech recognition system for the CHiME-5 challenge to recognize highly-overlapped dinner party speech recorded by multiple microphone arrays. We explore data augmentation approaches, neural network architectures, front-end speech dereverberation, beamforming and robust i-vector extraction with comparisons of our in-house implementations and publicly available tools. We finally achieved a word error rate of 69.4% on the development set, which is a 11.7% absolute improvement over the previous baseline of 81.1%, and release this improved baseline with refined techniques/tools as an advanced CHiME-5 recipe. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.11078v1-abstract-full').style.display = 'none'; document.getElementById('2405.11078v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Published in: ICASSP 2019 - 2019 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> ICASSP 2019 - 2019 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), Brighton, UK, 2019, pp. 6665-6669 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.09385">arXiv:2404.09385</a> <span> [<a href="https://arxiv.org/pdf/2404.09385">pdf</a>, <a href="https://arxiv.org/format/2404.09385">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> A Large-Scale Evaluation of Speech Foundation Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Yang%2C+S">Shu-wen Yang</a>, <a href="/search/eess?searchtype=author&query=Chang%2C+H">Heng-Jui Chang</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+Z">Zili Huang</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+A+T">Andy T. Liu</a>, <a href="/search/eess?searchtype=author&query=Lai%2C+C">Cheng-I Lai</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+H">Haibin Wu</a>, <a href="/search/eess?searchtype=author&query=Shi%2C+J">Jiatong Shi</a>, <a href="/search/eess?searchtype=author&query=Chang%2C+X">Xuankai Chang</a>, <a href="/search/eess?searchtype=author&query=Tsai%2C+H">Hsiang-Sheng Tsai</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+W">Wen-Chin Huang</a>, <a href="/search/eess?searchtype=author&query=Feng%2C+T">Tzu-hsun Feng</a>, <a href="/search/eess?searchtype=author&query=Chi%2C+P">Po-Han Chi</a>, <a href="/search/eess?searchtype=author&query=Lin%2C+Y+Y">Yist Y. Lin</a>, <a href="/search/eess?searchtype=author&query=Chuang%2C+Y">Yung-Sung Chuang</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+T">Tzu-Hsien Huang</a>, <a href="/search/eess?searchtype=author&query=Tseng%2C+W">Wei-Cheng Tseng</a>, <a href="/search/eess?searchtype=author&query=Lakhotia%2C+K">Kushal Lakhotia</a>, <a href="/search/eess?searchtype=author&query=Li%2C+S">Shang-Wen Li</a>, <a href="/search/eess?searchtype=author&query=Mohamed%2C+A">Abdelrahman Mohamed</a>, <a href="/search/eess?searchtype=author&query=Watanabe%2C+S">Shinji Watanabe</a>, <a href="/search/eess?searchtype=author&query=Lee%2C+H">Hung-yi Lee</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.09385v2-abstract-short" style="display: inline;"> The foundation model paradigm leverages a shared foundation model to achieve state-of-the-art (SOTA) performance for various tasks, requiring minimal downstream-specific modeling and data annotation. This approach has proven crucial in the field of Natural Language Processing (NLP). However, the speech processing community lacks a similar setup to explore the paradigm systematically. In this work,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.09385v2-abstract-full').style.display = 'inline'; document.getElementById('2404.09385v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.09385v2-abstract-full" style="display: none;"> The foundation model paradigm leverages a shared foundation model to achieve state-of-the-art (SOTA) performance for various tasks, requiring minimal downstream-specific modeling and data annotation. This approach has proven crucial in the field of Natural Language Processing (NLP). However, the speech processing community lacks a similar setup to explore the paradigm systematically. In this work, we establish the Speech processing Universal PERformance Benchmark (SUPERB) to study the effectiveness of the paradigm for speech. We propose a unified multi-tasking framework to address speech processing tasks in SUPERB using a frozen foundation model followed by task-specialized, lightweight prediction heads. Combining our results with community submissions, we verify that the foundation model paradigm is promising for speech, and our multi-tasking framework is simple yet effective, as the best-performing foundation model shows competitive generalizability across most SUPERB tasks. For reproducibility and extensibility, we have developed a long-term maintained platform that enables deterministic benchmarking, allows for result sharing via an online leaderboard, and promotes collaboration through a community-driven benchmark database to support new development cycles. Finally, we conduct a series of analyses to offer an in-depth understanding of SUPERB and speech foundation models, including information flows across tasks inside the models, the correctness of the weighted-sum benchmarking protocol and the statistical significance and robustness of the benchmark. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.09385v2-abstract-full').style.display = 'none'; document.getElementById('2404.09385v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 14 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">The extended journal version for SUPERB and SUPERB-SG. Published in IEEE/ACM TASLP. The Arxiv version is preferred</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.19207">arXiv:2403.19207</a> <span> [<a href="https://arxiv.org/pdf/2403.19207">pdf</a>, <a href="https://arxiv.org/format/2403.19207">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> LV-CTC: Non-autoregressive ASR with CTC and latent variable models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Fujita%2C+Y">Yuya Fujita</a>, <a href="/search/eess?searchtype=author&query=Watanabe%2C+S">Shinji Watanabe</a>, <a href="/search/eess?searchtype=author&query=Chang%2C+X">Xuankai Chang</a>, <a href="/search/eess?searchtype=author&query=Maekaku%2C+T">Takashi Maekaku</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.19207v1-abstract-short" style="display: inline;"> Non-autoregressive (NAR) models for automatic speech recognition (ASR) aim to achieve high accuracy and fast inference by simplifying the autoregressive (AR) generation process of conventional models. Connectionist temporal classification (CTC) is one of the key techniques used in NAR ASR models. In this paper, we propose a new model combining CTC and a latent variable model, which is one of the s… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.19207v1-abstract-full').style.display = 'inline'; document.getElementById('2403.19207v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.19207v1-abstract-full" style="display: none;"> Non-autoregressive (NAR) models for automatic speech recognition (ASR) aim to achieve high accuracy and fast inference by simplifying the autoregressive (AR) generation process of conventional models. Connectionist temporal classification (CTC) is one of the key techniques used in NAR ASR models. In this paper, we propose a new model combining CTC and a latent variable model, which is one of the state-of-the-art models in the neural machine translation research field. A new neural network architecture and formulation specialized for ASR application are introduced. In the proposed model, CTC alignment is assumed to be dependent on the latent variables that are expected to capture dependencies between tokens. Experimental results on a 100 hours subset of Librispeech corpus showed the best recognition accuracy among CTC-based NAR models. On the TED-LIUM2 corpus, the best recognition accuracy is achieved including AR E2E models with faster inference speed. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.19207v1-abstract-full').style.display = 'none'; document.getElementById('2403.19207v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.05887">arXiv:2403.05887</a> <span> [<a href="https://arxiv.org/pdf/2403.05887">pdf</a>, <a href="https://arxiv.org/format/2403.05887">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Aligning Speech to Languages to Enhance Code-switching Speech Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Liu%2C+H">Hexin Liu</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+X">Xiangyu Zhang</a>, <a href="/search/eess?searchtype=author&query=Garcia%2C+L+P">Leibny Paola Garcia</a>, <a href="/search/eess?searchtype=author&query=Khong%2C+A+W+H">Andy W. H. Khong</a>, <a href="/search/eess?searchtype=author&query=Chng%2C+E+S">Eng Siong Chng</a>, <a href="/search/eess?searchtype=author&query=Watanabe%2C+S">Shinji Watanabe</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.05887v1-abstract-short" style="display: inline;"> Code-switching (CS) refers to the switching of languages within a speech signal and results in language confusion for automatic speech recognition (ASR). To address language confusion, we propose the language alignment loss that performs frame-level language identification using pseudo language labels learned from the ASR decoder. This eliminates the need for frame-level language annotations. To f… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.05887v1-abstract-full').style.display = 'inline'; document.getElementById('2403.05887v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.05887v1-abstract-full" style="display: none;"> Code-switching (CS) refers to the switching of languages within a speech signal and results in language confusion for automatic speech recognition (ASR). To address language confusion, we propose the language alignment loss that performs frame-level language identification using pseudo language labels learned from the ASR decoder. This eliminates the need for frame-level language annotations. To further tackle the complex token alternatives for language modeling in bilingual scenarios, we propose to employ large language models via a generative error correction method. A linguistic hint that incorporates language information (derived from the proposed language alignment loss and decoded hypotheses) is introduced to guide the prompting of large language models. The proposed methods are evaluated on the SEAME dataset and data from the ASRU 2019 Mandarin-English code-switching speech recognition challenge. The incorporation of the proposed language alignment loss demonstrates a higher CS-ASR performance with only a negligible increase in the number of parameters on both datasets compared to the baseline model. This work also highlights the efficacy of language alignment loss in balancing primary-language-dominant bilingual data during training, with an 8.6% relative improvement on the ASRU dataset compared to the baseline model. Performance evaluation using large language models reveals the advantage of the linguistic hint by achieving 14.1% and 5.5% relative improvement on test sets of the ASRU and SEAME datasets, respectively. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.05887v1-abstract-full').style.display = 'none'; document.getElementById('2403.05887v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Manuscript submitted to IEEE/ACM Transactions on Audio, Speech, and Language Processing</span> </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Watanabe%2C+S&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Watanabe%2C+S&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Watanabe%2C+S&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Watanabe%2C+S&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Watanabe%2C+S&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Watanabe%2C+S&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li> <a href="/search/?searchtype=author&query=Watanabe%2C+S&start=250" class="pagination-link " aria-label="Page 6" aria-current="page">6 </a> </li> <li> <a href="/search/?searchtype=author&query=Watanabe%2C+S&start=300" class="pagination-link " aria-label="Page 7" aria-current="page">7 </a> </li> </ul> </nav> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>