Search | arXiv e-print repository

<!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"/> <meta name="viewport" content="width=device-width, initial-scale=1"/>  <link rel="apple-touch-icon" sizes="180x180" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-16x16.png"> <link rel="manifest" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/site.webmanifest"> <link rel="mask-icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/safari-pinned-tab.svg" color="#b31b1b"> <link rel="shortcut icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon.ico"> <meta name="msapplication-TileColor" content="#b31b1b"> <meta name="msapplication-config" content="images/icons/browserconfig.xml"> <meta name="theme-color" content="#b31b1b">  <title>Search | arXiv e-print repository</title> <script defer src="https://static.arxiv.org/static/base/1.0.0a5/fontawesome-free-5.11.2-web/js/all.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/base/1.0.0a5/css/arxivstyle.css" /> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ messageStyle: "none", extensions: ["tex2jax.js"], jax: ["input/TeX", "output/HTML-CSS"], tex2jax: { inlineMath: [ ['$','$'], ["\$","\$"] ], displayMath: [ ['$$','$$'], ["\\[","\\]"] ], processEscapes: true, ignoreClass: '.*', processClass: 'mathjax.*' }, TeX: { extensions: ["AMSmath.js", "AMSsymbols.js", "noErrors.js"], noErrors: { inlineDelimiters: ["$","$"], multiLine: false, style: { "font-size": "normal", "border": "" } } }, "HTML-CSS": { availableFonts: ["TeX"] } }); </script> <script src='//static.arxiv.org/MathJax-2.7.3/MathJax.js'></script> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/notification.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/bulma-tooltip.min.css" /> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/search.css" /> <script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g=" crossorigin="anonymous"></script> <script src="https://static.arxiv.org/static/search/0.5.6/js/fieldset.js"></script> <style> radio#cf-customfield_11400 { display: none; } </style> </head> <body> <header><a href="#main-container" class="is-sr-only">Skip to main content</a>  <div class="attribution level is-marginless" role="banner"> <div class="level-left"> <a class="level-item" href="https://cornell.edu/"><img src="https://static.arxiv.org/static/base/1.0.0a5/images/cornell-reduced-white-SMALL.svg" alt="Cornell University" width="200" aria-label="logo" /></a> </div> <div class="level-right is-marginless"><p class="sponsors level-item is-marginless"><span id="support-ack-url">We gratefully acknowledge support from<br /> the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors. <a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div>  <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div>  <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 73 results for author: <span class="mathjax">Ginsburg, B</span> </h1> </div> <div class="level-right is-hidden-mobile">  <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>  </span> </div> </div> <div class="content"> <form method="GET" action="/search/eess" aria-role="search"> Searching in archive <strong>eess</strong>. <a href="/search/?searchtype=author&query=Ginsburg%2C+B">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." type="text" value="Ginsburg, B"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Ginsburg%2C+B&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Ginsburg, B"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. </div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Ginsburg%2C+B&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Ginsburg%2C+B&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Ginsburg%2C+B&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.14788">arXiv:2501.14788</a> <span> [<a href="https://arxiv.org/pdf/2501.14788">pdf</a>, <a href="https://arxiv.org/ps/2501.14788">ps</a>, <a href="https://arxiv.org/format/2501.14788">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Methods to Increase the Amount of Data for Speech Recognition for Low Resource Languages </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Ayrapetyan%2C+A">Alexan Ayrapetyan</a>, <a href="/search/eess?searchtype=author&query=Kostandian%2C+S">Sofia Kostandian</a>, <a href="/search/eess?searchtype=author&query=Yeroyan%2C+A">Ara Yeroyan</a>, <a href="/search/eess?searchtype=author&query=Yerznkanyan%2C+M">Mher Yerznkanyan</a>, <a href="/search/eess?searchtype=author&query=Karpov%2C+N">Nikolay Karpov</a>, <a href="/search/eess?searchtype=author&query=Tadevosyan%2C+N">Nune Tadevosyan</a>, <a href="/search/eess?searchtype=author&query=Lavrukhin%2C+V">Vitaly Lavrukhin</a>, <a href="/search/eess?searchtype=author&query=Ginsburg%2C+B">Boris Ginsburg</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.14788v2-abstract-short" style="display: inline;"> This study explores methods to increase data volume for low-resource languages using techniques such as crowdsourcing, pseudo-labeling, advanced data preprocessing and various permissive data sources such as audiobooks, Common Voice, YouTube. While these methods are well-explored for highresource languages, their application for low-resource languages remains underexplored. Using Armenian and Geor… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.14788v2-abstract-full').style.display = 'inline'; document.getElementById('2501.14788v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.14788v2-abstract-full" style="display: none;"> This study explores methods to increase data volume for low-resource languages using techniques such as crowdsourcing, pseudo-labeling, advanced data preprocessing and various permissive data sources such as audiobooks, Common Voice, YouTube. While these methods are well-explored for highresource languages, their application for low-resource languages remains underexplored. Using Armenian and Georgian as case studies, we demonstrate how linguistic and resource-specific characteristics influence the success of these methods. This work provides practical guidance for researchers to choose cost-effective and quality-driven dataset extension strategies for low-resource languages. The key takeaway from various data extension approaches is that paid crowd-sourcing offers the best balance between cost and quality, outperforming volunteer crowd-sourcing, open-source audiobooks, and unlabeled data usage. Ablation study shows that models trained on the expanded datasets outperform existing baselines and achieve 5.73% for Gergian and 9.9% for Armenian ASR word error rate using a relatively small FastConformer architecture. We open-sourced both the Armenian and Georgian models to allow further research and practical applications. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.14788v2-abstract-full').style.display = 'none'; document.getElementById('2501.14788v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 8 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">The first four authors contributed equally</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.05945">arXiv:2411.05945</a> <span> [<a href="https://arxiv.org/pdf/2411.05945">pdf</a>, <a href="https://arxiv.org/format/2411.05945">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multiagent Systems">cs.MA</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> NeKo: Toward Post Recognition Generative Correction Large Language Models with Task-Oriented Experts </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Lin%2C+Y">Yen-Ting Lin</a>, <a href="/search/eess?searchtype=author&query=Yang%2C+C+H">Chao-Han Huck Yang</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+Z">Zhehuai Chen</a>, <a href="/search/eess?searchtype=author&query=Zelasko%2C+P">Piotr Zelasko</a>, <a href="/search/eess?searchtype=author&query=Yang%2C+X">Xuesong Yang</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+Z">Zih-Ching Chen</a>, <a href="/search/eess?searchtype=author&query=Puvvada%2C+K+C">Krishna C Puvvada</a>, <a href="/search/eess?searchtype=author&query=Fu%2C+S">Szu-Wei Fu</a>, <a href="/search/eess?searchtype=author&query=Hu%2C+K">Ke Hu</a>, <a href="/search/eess?searchtype=author&query=Chiu%2C+J+W">Jun Wei Chiu</a>, <a href="/search/eess?searchtype=author&query=Balam%2C+J">Jagadeesh Balam</a>, <a href="/search/eess?searchtype=author&query=Ginsburg%2C+B">Boris Ginsburg</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+Y+F">Yu-Chiang Frank Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.05945v1-abstract-short" style="display: inline;"> Construction of a general-purpose post-recognition error corrector poses a crucial question: how can we most effectively train a model on a large mixture of domain datasets? The answer would lie in learning dataset-specific features and digesting their knowledge in a single model. Previous methods achieve this by having separate correction language models, resulting in a significant increase in pa… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05945v1-abstract-full').style.display = 'inline'; document.getElementById('2411.05945v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.05945v1-abstract-full" style="display: none;"> Construction of a general-purpose post-recognition error corrector poses a crucial question: how can we most effectively train a model on a large mixture of domain datasets? The answer would lie in learning dataset-specific features and digesting their knowledge in a single model. Previous methods achieve this by having separate correction language models, resulting in a significant increase in parameters. In this work, we present Mixture-of-Experts as a solution, highlighting that MoEs are much more than a scalability tool. We propose a Multi-Task Correction MoE, where we train the experts to become an ``expert'' of speech-to-text, language-to-text and vision-to-text datasets by learning to route each dataset's tokens to its mapped expert. Experiments on the Open ASR Leaderboard show that we explore a new state-of-the-art performance by achieving an average relative $5.0$% WER reduction and substantial improvements in BLEU scores for speech and translation tasks. On zero-shot evaluation, NeKo outperforms GPT-3.5 and Claude-Opus with $15.5$% to $27.6$% relative WER reduction in the Hyporadise benchmark. NeKo performs competitively on grammar and post-OCR correction as a multi-task model. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05945v1-abstract-full').style.display = 'none'; document.getElementById('2411.05945v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">NeKo work has been done in June 2024. NeKo LMs will be open source on https://huggingface.co/nvidia under the MIT license</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.17485">arXiv:2410.17485</a> <span> [<a href="https://arxiv.org/pdf/2410.17485">pdf</a>, <a href="https://arxiv.org/format/2410.17485">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> VoiceTextBlender: Augmenting Large Language Models with Speech Capabilities via Single-Stage Joint Speech-Text Supervised Fine-Tuning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Peng%2C+Y">Yifan Peng</a>, <a href="/search/eess?searchtype=author&query=Puvvada%2C+K+C">Krishna C. Puvvada</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+Z">Zhehuai Chen</a>, <a href="/search/eess?searchtype=author&query=Zelasko%2C+P">Piotr Zelasko</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+H">He Huang</a>, <a href="/search/eess?searchtype=author&query=Dhawan%2C+K">Kunal Dhawan</a>, <a href="/search/eess?searchtype=author&query=Hu%2C+K">Ke Hu</a>, <a href="/search/eess?searchtype=author&query=Watanabe%2C+S">Shinji Watanabe</a>, <a href="/search/eess?searchtype=author&query=Balam%2C+J">Jagadeesh Balam</a>, <a href="/search/eess?searchtype=author&query=Ginsburg%2C+B">Boris Ginsburg</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.17485v2-abstract-short" style="display: inline;"> Recent studies have augmented large language models (LLMs) with speech capabilities, leading to the development of speech language models (SpeechLMs). Earlier SpeechLMs focused on single-turn speech-based question answering (QA), where user input comprised a speech context and a text question. More recent studies have extended this to multi-turn conversations, though they often require complex, mu… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.17485v2-abstract-full').style.display = 'inline'; document.getElementById('2410.17485v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.17485v2-abstract-full" style="display: none;"> Recent studies have augmented large language models (LLMs) with speech capabilities, leading to the development of speech language models (SpeechLMs). Earlier SpeechLMs focused on single-turn speech-based question answering (QA), where user input comprised a speech context and a text question. More recent studies have extended this to multi-turn conversations, though they often require complex, multi-stage supervised fine-tuning (SFT) with diverse data. Another critical challenge with SpeechLMs is catastrophic forgetting, where models optimized for speech tasks suffer significant degradation in text-only performance. To mitigate these issues, we propose a novel single-stage joint speech-text SFT approach on the low-rank adaptation (LoRA) of the LLM backbone. Our joint SFT combines text-only SFT data with three types of speech-related data: speech recognition and translation, speech-based QA, and mixed-modal SFT. Compared to previous SpeechLMs with 7B or 13B parameters, our 3B model demonstrates superior performance across various speech benchmarks while preserving the original capabilities on text-only tasks. Furthermore, our model shows emergent abilities of effectively handling previously unseen prompts and tasks, including multi-turn, mixed-modal inputs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.17485v2-abstract-full').style.display = 'none'; document.getElementById('2410.17485v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 22 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at NAACL 2025 main conference</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.20007">arXiv:2409.20007</a> <span> [<a href="https://arxiv.org/pdf/2409.20007">pdf</a>, <a href="https://arxiv.org/format/2409.20007">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> DeSTA2: Developing Instruction-Following Speech Language Model Without Speech Instruction-Tuning Data </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Lu%2C+K">Ke-Han Lu</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+Z">Zhehuai Chen</a>, <a href="/search/eess?searchtype=author&query=Fu%2C+S">Szu-Wei Fu</a>, <a href="/search/eess?searchtype=author&query=Yang%2C+C+H">Chao-Han Huck Yang</a>, <a href="/search/eess?searchtype=author&query=Balam%2C+J">Jagadeesh Balam</a>, <a href="/search/eess?searchtype=author&query=Ginsburg%2C+B">Boris Ginsburg</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+Y+F">Yu-Chiang Frank Wang</a>, <a href="/search/eess?searchtype=author&query=Lee%2C+H">Hung-yi Lee</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.20007v2-abstract-short" style="display: inline;"> Recent end-to-end speech language models (SLMs) have expanded upon the capabilities of large language models (LLMs) by incorporating pre-trained speech models. However, these SLMs often undergo extensive speech instruction-tuning to bridge the gap between speech and text modalities. This requires significant annotation efforts and risks catastrophic forgetting of the original language capabilities… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.20007v2-abstract-full').style.display = 'inline'; document.getElementById('2409.20007v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.20007v2-abstract-full" style="display: none;"> Recent end-to-end speech language models (SLMs) have expanded upon the capabilities of large language models (LLMs) by incorporating pre-trained speech models. However, these SLMs often undergo extensive speech instruction-tuning to bridge the gap between speech and text modalities. This requires significant annotation efforts and risks catastrophic forgetting of the original language capabilities. In this work, we present a simple yet effective automatic process for creating speech-text pair data that carefully injects speech paralinguistic understanding abilities into SLMs while preserving the inherent language capabilities of the text-based LLM. Our model demonstrates general capabilities for speech-related tasks without the need for speech instruction-tuning data, achieving impressive performance on Dynamic-SUPERB and AIR-Bench-Chat benchmarks. Furthermore, our model exhibits the ability to follow complex instructions derived from LLMs, such as specific output formatting and chain-of-thought reasoning. Our approach not only enhances the versatility and effectiveness of SLMs but also reduces reliance on extensive annotated datasets, paving the way for more efficient and capable speech understanding systems. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.20007v2-abstract-full').style.display = 'none'; document.getElementById('2409.20007v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 30 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ICASSP 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.13523">arXiv:2409.13523</a> <span> [<a href="https://arxiv.org/pdf/2409.13523">pdf</a>, <a href="https://arxiv.org/format/2409.13523">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> EMMeTT: Efficient Multimodal Machine Translation Training </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=%C5%BBelasko%2C+P">Piotr 呕elasko</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+Z">Zhehuai Chen</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+M">Mengru Wang</a>, <a href="/search/eess?searchtype=author&query=Galvez%2C+D">Daniel Galvez</a>, <a href="/search/eess?searchtype=author&query=Hrinchuk%2C+O">Oleksii Hrinchuk</a>, <a href="/search/eess?searchtype=author&query=Ding%2C+S">Shuoyang Ding</a>, <a href="/search/eess?searchtype=author&query=Hu%2C+K">Ke Hu</a>, <a href="/search/eess?searchtype=author&query=Balam%2C+J">Jagadeesh Balam</a>, <a href="/search/eess?searchtype=author&query=Lavrukhin%2C+V">Vitaly Lavrukhin</a>, <a href="/search/eess?searchtype=author&query=Ginsburg%2C+B">Boris Ginsburg</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.13523v1-abstract-short" style="display: inline;"> A rising interest in the modality extension of foundation language models warrants discussion on the most effective, and efficient, multimodal training approach. This work focuses on neural machine translation (NMT) and proposes a joint multimodal training regime of Speech-LLM to include automatic speech translation (AST). We investigate two different foundation model architectures, decoder-only G… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.13523v1-abstract-full').style.display = 'inline'; document.getElementById('2409.13523v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.13523v1-abstract-full" style="display: none;"> A rising interest in the modality extension of foundation language models warrants discussion on the most effective, and efficient, multimodal training approach. This work focuses on neural machine translation (NMT) and proposes a joint multimodal training regime of Speech-LLM to include automatic speech translation (AST). We investigate two different foundation model architectures, decoder-only GPT and encoder-decoder T5, extended with Canary-1B's speech encoder. To handle joint multimodal training, we propose a novel training framework called EMMeTT. EMMeTT improves training efficiency with the following: balanced sampling across languages, datasets, and modalities; efficient sequential data iteration; and a novel 2D bucketing scheme for multimodal data, complemented by a batch size optimizer (OOMptimizer). We show that a multimodal training consistently helps with both architectures. Moreover, SALM-T5 trained with EMMeTT retains the original NMT capability while outperforming AST baselines on four-language subsets of FLORES and FLEURS. The resultant Multimodal Translation Model produces strong text and speech translation results at the same time. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.13523v1-abstract-full').style.display = 'none'; document.getElementById('2409.13523v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">4 pages, submitted to ICASSP 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.12352">arXiv:2409.12352</a> <span> [<a href="https://arxiv.org/pdf/2409.12352">pdf</a>, <a href="https://arxiv.org/format/2409.12352">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> META-CAT: Speaker-Informed Speech Embeddings via Meta Information Concatenation for Multi-talker ASR </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Wang%2C+J">Jinhan Wang</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+W">Weiqing Wang</a>, <a href="/search/eess?searchtype=author&query=Dhawan%2C+K">Kunal Dhawan</a>, <a href="/search/eess?searchtype=author&query=Park%2C+T">Taejin Park</a>, <a href="/search/eess?searchtype=author&query=Kim%2C+M">Myungjong Kim</a>, <a href="/search/eess?searchtype=author&query=Medennikov%2C+I">Ivan Medennikov</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+H">He Huang</a>, <a href="/search/eess?searchtype=author&query=Koluguri%2C+N">Nithin Koluguri</a>, <a href="/search/eess?searchtype=author&query=Balam%2C+J">Jagadeesh Balam</a>, <a href="/search/eess?searchtype=author&query=Ginsburg%2C+B">Boris Ginsburg</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.12352v1-abstract-short" style="display: inline;"> We propose a novel end-to-end multi-talker automatic speech recognition (ASR) framework that enables both multi-speaker (MS) ASR and target-speaker (TS) ASR. Our proposed model is trained in a fully end-to-end manner, incorporating speaker supervision from a pre-trained speaker diarization module. We introduce an intuitive yet effective method for masking ASR encoder activations using output from… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.12352v1-abstract-full').style.display = 'inline'; document.getElementById('2409.12352v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.12352v1-abstract-full" style="display: none;"> We propose a novel end-to-end multi-talker automatic speech recognition (ASR) framework that enables both multi-speaker (MS) ASR and target-speaker (TS) ASR. Our proposed model is trained in a fully end-to-end manner, incorporating speaker supervision from a pre-trained speaker diarization module. We introduce an intuitive yet effective method for masking ASR encoder activations using output from the speaker supervision module, a technique we term Meta-Cat (meta-information concatenation), that can be applied to both MS-ASR and TS-ASR. Our results demonstrate that the proposed architecture achieves competitive performance in both MS-ASR and TS-ASR tasks, without the need for traditional methods, such as neural mask estimation or masking at the audio or feature level. Furthermore, we demonstrate a glimpse of a unified dual-task model which can efficiently handle both MS-ASR and TS-ASR tasks. Thus, this work illustrates that a robust end-to-end multi-talker ASR framework can be implemented with a streamlined architecture, obviating the need for the complex speaker filtering mechanisms employed in previous studies. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.12352v1-abstract-full').style.display = 'none'; document.getElementById('2409.12352v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.09785">arXiv:2409.09785</a> <span> [<a href="https://arxiv.org/pdf/2409.09785">pdf</a>, <a href="https://arxiv.org/format/2409.09785">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Large Language Model Based Generative Error Correction: A Challenge and Baselines for Speech Recognition, Speaker Tagging, and Emotion Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Yang%2C+C+H">Chao-Han Huck Yang</a>, <a href="/search/eess?searchtype=author&query=Park%2C+T">Taejin Park</a>, <a href="/search/eess?searchtype=author&query=Gong%2C+Y">Yuan Gong</a>, <a href="/search/eess?searchtype=author&query=Li%2C+Y">Yuanchao Li</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+Z">Zhehuai Chen</a>, <a href="/search/eess?searchtype=author&query=Lin%2C+Y">Yen-Ting Lin</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+C">Chen Chen</a>, <a href="/search/eess?searchtype=author&query=Hu%2C+Y">Yuchen Hu</a>, <a href="/search/eess?searchtype=author&query=Dhawan%2C+K">Kunal Dhawan</a>, <a href="/search/eess?searchtype=author&query=%C5%BBelasko%2C+P">Piotr 呕elasko</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+C">Chao Zhang</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+Y">Yun-Nung Chen</a>, <a href="/search/eess?searchtype=author&query=Tsao%2C+Y">Yu Tsao</a>, <a href="/search/eess?searchtype=author&query=Balam%2C+J">Jagadeesh Balam</a>, <a href="/search/eess?searchtype=author&query=Ginsburg%2C+B">Boris Ginsburg</a>, <a href="/search/eess?searchtype=author&query=Siniscalchi%2C+S+M">Sabato Marco Siniscalchi</a>, <a href="/search/eess?searchtype=author&query=Chng%2C+E+S">Eng Siong Chng</a>, <a href="/search/eess?searchtype=author&query=Bell%2C+P">Peter Bell</a>, <a href="/search/eess?searchtype=author&query=Lai%2C+C">Catherine Lai</a>, <a href="/search/eess?searchtype=author&query=Watanabe%2C+S">Shinji Watanabe</a>, <a href="/search/eess?searchtype=author&query=Stolcke%2C+A">Andreas Stolcke</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.09785v3-abstract-short" style="display: inline;"> Given recent advances in generative AI technology, a key question is how large language models (LLMs) can enhance acoustic modeling tasks using text decoding results from a frozen, pretrained automatic speech recognition (ASR) model. To explore new capabilities in language modeling for speech processing, we introduce the generative speech transcription error correction (GenSEC) challenge. This cha… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.09785v3-abstract-full').style.display = 'inline'; document.getElementById('2409.09785v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.09785v3-abstract-full" style="display: none;"> Given recent advances in generative AI technology, a key question is how large language models (LLMs) can enhance acoustic modeling tasks using text decoding results from a frozen, pretrained automatic speech recognition (ASR) model. To explore new capabilities in language modeling for speech processing, we introduce the generative speech transcription error correction (GenSEC) challenge. This challenge comprises three post-ASR language modeling tasks: (i) post-ASR transcription correction, (ii) speaker tagging, and (iii) emotion recognition. These tasks aim to emulate future LLM-based agents handling voice-based interfaces while remaining accessible to a broad audience by utilizing open pretrained language models or agent-based APIs. We also discuss insights from baseline evaluations, as well as lessons learned for designing future evaluations. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.09785v3-abstract-full').style.display = 'none'; document.getElementById('2409.09785v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 15 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">IEEE SLT 2024. The initial draft version has been done in December 2023. Post-ASR Text Processing and Understanding Community and LlaMA-7B pre-training correction model: https://huggingface.co/GenSEC-LLM/SLT-Task1-Llama2-7b-HyPo-baseline</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.06656">arXiv:2409.06656</a> <span> [<a href="https://arxiv.org/pdf/2409.06656">pdf</a>, <a href="https://arxiv.org/format/2409.06656">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Sortformer: Seamless Integration of Speaker Diarization and ASR by Bridging Timestamps and Tokens </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Park%2C+T">Taejin Park</a>, <a href="/search/eess?searchtype=author&query=Medennikov%2C+I">Ivan Medennikov</a>, <a href="/search/eess?searchtype=author&query=Dhawan%2C+K">Kunal Dhawan</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+W">Weiqing Wang</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+H">He Huang</a>, <a href="/search/eess?searchtype=author&query=Koluguri%2C+N+R">Nithin Rao Koluguri</a>, <a href="/search/eess?searchtype=author&query=Puvvada%2C+K+C">Krishna C. Puvvada</a>, <a href="/search/eess?searchtype=author&query=Balam%2C+J">Jagadeesh Balam</a>, <a href="/search/eess?searchtype=author&query=Ginsburg%2C+B">Boris Ginsburg</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.06656v2-abstract-short" style="display: inline;"> We propose Sortformer, a novel neural model for speaker diarization, trained with unconventional objectives compared to existing end-to-end diarization models. The permutation problem in speaker diarization has long been regarded as a critical challenge. Most prior end-to-end diarization systems employ permutation invariant loss (PIL), which optimizes for the permutation that yields the lowest err… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.06656v2-abstract-full').style.display = 'inline'; document.getElementById('2409.06656v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.06656v2-abstract-full" style="display: none;"> We propose Sortformer, a novel neural model for speaker diarization, trained with unconventional objectives compared to existing end-to-end diarization models. The permutation problem in speaker diarization has long been regarded as a critical challenge. Most prior end-to-end diarization systems employ permutation invariant loss (PIL), which optimizes for the permutation that yields the lowest error. In contrast, we introduce Sort Loss, which enables a diarization model to autonomously resolve permutation, with or without PIL. We demonstrate that combining Sort Loss and PIL achieves performance competitive with state-of-the-art end-to-end diarization models trained exclusively with PIL. Crucially, we present a streamlined multispeaker ASR architecture that leverages Sortformer as a speaker supervision model, embedding speaker label estimation within the ASR encoder state using a sinusoidal kernel function. This approach resolves the speaker permutation problem through sorted objectives, effectively bridging speaker-label timestamps and speaker tokens. In our experiments, we show that the proposed multispeaker ASR architecture, enhanced with speaker supervision, improves performance via adapter techniques. Code and trained models will be made publicly available via the NVIDIA NeMo framework. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.06656v2-abstract-full').style.display = 'none'; document.getElementById('2409.06656v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 10 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.05601">arXiv:2409.05601</a> <span> [<a href="https://arxiv.org/pdf/2409.05601">pdf</a>, <a href="https://arxiv.org/format/2409.05601">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Longer is (Not Necessarily) Stronger: Punctuated Long-Sequence Training for Enhanced Speech Recognition and Translation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Koluguri%2C+N+R">Nithin Rao Koluguri</a>, <a href="/search/eess?searchtype=author&query=Bartley%2C+T">Travis Bartley</a>, <a href="/search/eess?searchtype=author&query=Xu%2C+H">Hainan Xu</a>, <a href="/search/eess?searchtype=author&query=Hrinchuk%2C+O">Oleksii Hrinchuk</a>, <a href="/search/eess?searchtype=author&query=Balam%2C+J">Jagadeesh Balam</a>, <a href="/search/eess?searchtype=author&query=Ginsburg%2C+B">Boris Ginsburg</a>, <a href="/search/eess?searchtype=author&query=Kucsko%2C+G">Georg Kucsko</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.05601v1-abstract-short" style="display: inline;"> This paper presents a new method for training sequence-to-sequence models for speech recognition and translation tasks. Instead of the traditional approach of training models on short segments containing only lowercase or partial punctuation and capitalization (PnC) sentences, we propose training on longer utterances that include complete sentences with proper punctuation and capitalization. We ac… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.05601v1-abstract-full').style.display = 'inline'; document.getElementById('2409.05601v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.05601v1-abstract-full" style="display: none;"> This paper presents a new method for training sequence-to-sequence models for speech recognition and translation tasks. Instead of the traditional approach of training models on short segments containing only lowercase or partial punctuation and capitalization (PnC) sentences, we propose training on longer utterances that include complete sentences with proper punctuation and capitalization. We achieve this by using the FastConformer architecture which allows training 1 Billion parameter models with sequences up to 60 seconds long with full attention. However, while training with PnC enhances the overall performance, we observed that accuracy plateaus when training on sequences longer than 40 seconds across various evaluation settings. Our proposed method significantly improves punctuation and capitalization accuracy, showing a 25% relative word error rate (WER) improvement on the Earnings-21 and Earnings-22 benchmarks. Additionally, training on longer audio segments increases the overall model accuracy across speech recognition and translation benchmarks. The model weights and training code are open-sourced though NVIDIA NeMo. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.05601v1-abstract-full').style.display = 'none'; document.getElementById('2409.05601v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at SLT 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.01438">arXiv:2409.01438</a> <span> [<a href="https://arxiv.org/pdf/2409.01438">pdf</a>, <a href="https://arxiv.org/format/2409.01438">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Resource-Efficient Adaptation of Speech Foundation Models for Multi-Speaker ASR </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Wang%2C+W">Weiqing Wang</a>, <a href="/search/eess?searchtype=author&query=Dhawan%2C+K">Kunal Dhawan</a>, <a href="/search/eess?searchtype=author&query=Park%2C+T">Taejin Park</a>, <a href="/search/eess?searchtype=author&query=Puvvada%2C+K+C">Krishna C. Puvvada</a>, <a href="/search/eess?searchtype=author&query=Medennikov%2C+I">Ivan Medennikov</a>, <a href="/search/eess?searchtype=author&query=Majumdar%2C+S">Somshubra Majumdar</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+H">He Huang</a>, <a href="/search/eess?searchtype=author&query=Balam%2C+J">Jagadeesh Balam</a>, <a href="/search/eess?searchtype=author&query=Ginsburg%2C+B">Boris Ginsburg</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.01438v2-abstract-short" style="display: inline;"> Speech foundation models have achieved state-of-the-art (SoTA) performance across various tasks, such as automatic speech recognition (ASR) in hundreds of languages. However, multi-speaker ASR remains a challenging task for these models due to data scarcity and sparsity. In this paper, we present approaches to enable speech foundation models to process and understand multi-speaker speech with limi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.01438v2-abstract-full').style.display = 'inline'; document.getElementById('2409.01438v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.01438v2-abstract-full" style="display: none;"> Speech foundation models have achieved state-of-the-art (SoTA) performance across various tasks, such as automatic speech recognition (ASR) in hundreds of languages. However, multi-speaker ASR remains a challenging task for these models due to data scarcity and sparsity. In this paper, we present approaches to enable speech foundation models to process and understand multi-speaker speech with limited training data. Specifically, we adapt a speech foundation model for the multi-speaker ASR task using only telephonic data. Remarkably, the adapted model also performs well on meeting data without any fine-tuning, demonstrating the generalization ability of our approach. We conduct several ablation studies to analyze the impact of different parameters and strategies on model performance. Our findings highlight the effectiveness of our methods. Results show that less parameters give better overall cpWER, which, although counter-intuitive, provides insights into adapting speech foundation models for multi-speaker ASR tasks with minimal annotated data. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.01438v2-abstract-full').style.display = 'none'; document.getElementById('2409.01438v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 2 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by SLT 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.13106">arXiv:2408.13106</a> <span> [<a href="https://arxiv.org/pdf/2408.13106">pdf</a>, <a href="https://arxiv.org/format/2408.13106">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> NEST: Self-supervised Fast Conformer as All-purpose Seasoning to Speech Processing Tasks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Huang%2C+H">He Huang</a>, <a href="/search/eess?searchtype=author&query=Park%2C+T">Taejin Park</a>, <a href="/search/eess?searchtype=author&query=Dhawan%2C+K">Kunal Dhawan</a>, <a href="/search/eess?searchtype=author&query=Medennikov%2C+I">Ivan Medennikov</a>, <a href="/search/eess?searchtype=author&query=Puvvada%2C+K+C">Krishna C. Puvvada</a>, <a href="/search/eess?searchtype=author&query=Koluguri%2C+N+R">Nithin Rao Koluguri</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+W">Weiqing Wang</a>, <a href="/search/eess?searchtype=author&query=Balam%2C+J">Jagadeesh Balam</a>, <a href="/search/eess?searchtype=author&query=Ginsburg%2C+B">Boris Ginsburg</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.13106v6-abstract-short" style="display: inline;"> Self-supervised learning has been proved to benefit a wide range of speech processing tasks, such as speech recognition/translation, speaker verification and diarization, etc. However, most of current approaches are computationally expensive. In this paper, we propose a simplified and more efficient self-supervised learning framework termed as NeMo Encoder for Speech Tasks (NEST). Specifically, we… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.13106v6-abstract-full').style.display = 'inline'; document.getElementById('2408.13106v6-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.13106v6-abstract-full" style="display: none;"> Self-supervised learning has been proved to benefit a wide range of speech processing tasks, such as speech recognition/translation, speaker verification and diarization, etc. However, most of current approaches are computationally expensive. In this paper, we propose a simplified and more efficient self-supervised learning framework termed as NeMo Encoder for Speech Tasks (NEST). Specifically, we adopt the FastConformer architecture with 8x sub-sampling rate, which is faster than Transformer or Conformer architectures. Instead of clustering-based quantization, we use fixed random projection for its simplicity and effectiveness. We also implement a generalized noisy speech augmentation that teaches the model to disentangle the main speaker from noise or other speakers. Experiments show that \model improves over existing self-supervised models and achieves new state-of-the-art performance on a variety of speech processing tasks, such as speech recognition/translation, speaker diarization, spoken language understanding, etc. Code and checkpoints are publicly available via NVIDIA NeMo framework. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.13106v6-abstract-full').style.display = 'none'; document.getElementById('2408.13106v6-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 23 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Published in ICASSP 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.16074">arXiv:2407.16074</a> <span> [<a href="https://arxiv.org/pdf/2407.16074">pdf</a>, <a href="https://arxiv.org/ps/2407.16074">ps</a>, <a href="https://arxiv.org/format/2407.16074">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Schr枚dinger Bridge for Generative Speech Enhancement </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Juki%C4%87%2C+A">Ante Juki膰</a>, <a href="/search/eess?searchtype=author&query=Korostik%2C+R">Roman Korostik</a>, <a href="/search/eess?searchtype=author&query=Balam%2C+J">Jagadeesh Balam</a>, <a href="/search/eess?searchtype=author&query=Ginsburg%2C+B">Boris Ginsburg</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.16074v1-abstract-short" style="display: inline;"> This paper proposes a generative speech enhancement model based on Schr枚dinger bridge (SB). The proposed model is employing a tractable SB to formulate a data-to-data process between the clean speech distribution and the observed noisy speech distribution. The model is trained with a data prediction loss, aiming to recover the complex-valued clean speech coefficients, and an auxiliary time-domain… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.16074v1-abstract-full').style.display = 'inline'; document.getElementById('2407.16074v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.16074v1-abstract-full" style="display: none;"> This paper proposes a generative speech enhancement model based on Schr枚dinger bridge (SB). The proposed model is employing a tractable SB to formulate a data-to-data process between the clean speech distribution and the observed noisy speech distribution. The model is trained with a data prediction loss, aiming to recover the complex-valued clean speech coefficients, and an auxiliary time-domain loss is used to improve training of the model. The effectiveness of the proposed SB-based model is evaluated in two different speech enhancement tasks: speech denoising and speech dereverberation. The experimental results demonstrate that the proposed SB-based outperforms diffusion-based models in terms of speech quality metrics and ASR performance, e.g., resulting in relative word error rate reduction of 20% for denoising and 6% for dereverberation compared to the best baseline model. The proposed model also demonstrates improved efficiency, achieving better quality than the baselines for the same number of sampling steps and with a reduced computational cost. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.16074v1-abstract-full').style.display = 'none'; document.getElementById('2407.16074v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.04368">arXiv:2407.04368</a> <span> [<a href="https://arxiv.org/pdf/2407.04368">pdf</a>, <a href="https://arxiv.org/format/2407.04368">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Romanization Encoding For Multilingual ASR </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Ding%2C+W">Wen Ding</a>, <a href="/search/eess?searchtype=author&query=Jia%2C+F">Fei Jia</a>, <a href="/search/eess?searchtype=author&query=Xu%2C+H">Hainan Xu</a>, <a href="/search/eess?searchtype=author&query=Xi%2C+Y">Yu Xi</a>, <a href="/search/eess?searchtype=author&query=Lai%2C+J">Junjie Lai</a>, <a href="/search/eess?searchtype=author&query=Ginsburg%2C+B">Boris Ginsburg</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.04368v2-abstract-short" style="display: inline;"> We introduce romanization encoding for script-heavy languages to optimize multilingual and code-switching Automatic Speech Recognition (ASR) systems. By adopting romanization encoding alongside a balanced concatenated tokenizer within a FastConformer-RNNT framework equipped with a Roman2Char module, we significantly reduce vocabulary and output dimensions, enabling larger training batches and redu… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.04368v2-abstract-full').style.display = 'inline'; document.getElementById('2407.04368v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.04368v2-abstract-full" style="display: none;"> We introduce romanization encoding for script-heavy languages to optimize multilingual and code-switching Automatic Speech Recognition (ASR) systems. By adopting romanization encoding alongside a balanced concatenated tokenizer within a FastConformer-RNNT framework equipped with a Roman2Char module, we significantly reduce vocabulary and output dimensions, enabling larger training batches and reduced memory consumption. Our method decouples acoustic modeling and language modeling, enhancing the flexibility and adaptability of the system. In our study, applying this method to Mandarin-English ASR resulted in a remarkable 63.51% vocabulary reduction and notable performance gains of 13.72% and 15.03% on SEAME code-switching benchmarks. Ablation studies on Mandarin-Korean and Mandarin-Japanese highlight our method's strong capability to address the complexities of other script-heavy languages, paving the way for more versatile and effective multilingual ASR systems. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.04368v2-abstract-full').style.display = 'none'; document.getElementById('2407.04368v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 5 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by IEEE SLT2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.03495">arXiv:2407.03495</a> <span> [<a href="https://arxiv.org/pdf/2407.03495">pdf</a>, <a href="https://arxiv.org/format/2407.03495">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.21437/Interspeech.2024-330">10.21437/Interspeech.2024-330 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Codec-ASR: Training Performant Automatic Speech Recognition Systems with Discrete Speech Representations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Dhawan%2C+K">Kunal Dhawan</a>, <a href="/search/eess?searchtype=author&query=Koluguri%2C+N+R">Nithin Rao Koluguri</a>, <a href="/search/eess?searchtype=author&query=Juki%C4%87%2C+A">Ante Juki膰</a>, <a href="/search/eess?searchtype=author&query=Langman%2C+R">Ryan Langman</a>, <a href="/search/eess?searchtype=author&query=Balam%2C+J">Jagadeesh Balam</a>, <a href="/search/eess?searchtype=author&query=Ginsburg%2C+B">Boris Ginsburg</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.03495v1-abstract-short" style="display: inline;"> Discrete speech representations have garnered recent attention for their efficacy in training transformer-based models for various speech-related tasks such as automatic speech recognition (ASR), translation, speaker verification, and joint speech-text foundational models. In this work, we present a comprehensive analysis on building ASR systems with discrete codes. We investigate different method… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.03495v1-abstract-full').style.display = 'inline'; document.getElementById('2407.03495v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.03495v1-abstract-full" style="display: none;"> Discrete speech representations have garnered recent attention for their efficacy in training transformer-based models for various speech-related tasks such as automatic speech recognition (ASR), translation, speaker verification, and joint speech-text foundational models. In this work, we present a comprehensive analysis on building ASR systems with discrete codes. We investigate different methods for codec training such as quantization schemes and time-domain vs spectral feature encodings. We further explore ASR training techniques aimed at enhancing performance, training efficiency, and noise robustness. Drawing upon our findings, we introduce a codec ASR pipeline that outperforms Encodec at similar bit-rate. Remarkably, it also surpasses the state-of-the-art results achieved by strong self-supervised models on the 143 languages ML-SUPERB benchmark despite being smaller in size and pretrained on significantly less data. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.03495v1-abstract-full').style.display = 'none'; document.getElementById('2407.03495v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at Interspeech 2024</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Proceedings of Interspeech 2024 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.19954">arXiv:2406.19954</a> <span> [<a href="https://arxiv.org/pdf/2406.19954">pdf</a>, <a href="https://arxiv.org/format/2406.19954">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> BESTOW: Efficient and Streamable Speech Language Model with the Best of Two Worlds in GPT and T5 </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Chen%2C+Z">Zhehuai Chen</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+H">He Huang</a>, <a href="/search/eess?searchtype=author&query=Hrinchuk%2C+O">Oleksii Hrinchuk</a>, <a href="/search/eess?searchtype=author&query=Puvvada%2C+K+C">Krishna C. Puvvada</a>, <a href="/search/eess?searchtype=author&query=Koluguri%2C+N+R">Nithin Rao Koluguri</a>, <a href="/search/eess?searchtype=author&query=%C5%BBelasko%2C+P">Piotr 呕elasko</a>, <a href="/search/eess?searchtype=author&query=Balam%2C+J">Jagadeesh Balam</a>, <a href="/search/eess?searchtype=author&query=Ginsburg%2C+B">Boris Ginsburg</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.19954v1-abstract-short" style="display: inline;"> Incorporating speech understanding capabilities into pretrained large-language models has become a vital research direction (SpeechLLM). The previous architectures can be categorized as: i) GPT-style, prepend speech prompts to the text prompts as a sequence of LLM inputs like a decoder-only model; ii) T5-style, introduce speech cross-attention to each layer of the pretrained LLMs. We propose BESTO… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.19954v1-abstract-full').style.display = 'inline'; document.getElementById('2406.19954v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.19954v1-abstract-full" style="display: none;"> Incorporating speech understanding capabilities into pretrained large-language models has become a vital research direction (SpeechLLM). The previous architectures can be categorized as: i) GPT-style, prepend speech prompts to the text prompts as a sequence of LLM inputs like a decoder-only model; ii) T5-style, introduce speech cross-attention to each layer of the pretrained LLMs. We propose BESTOW architecture to bring the BESt features from TwO Worlds into a single model that is highly efficient and has strong multitask capabilities. Moreover, there is no clear streaming solution for either style, especially considering the solution should generalize to speech multitask. We reformulate streamable SpeechLLM as a read-write policy problem and unifies the offline and streaming research with BESTOW architecture. Hence we demonstrate the first open-source SpeechLLM solution that enables Streaming and Multitask at scale (beyond ASR) at the same time. This streamable solution achieves very strong performance on a wide range of speech tasks (ASR, AST, SQA, unseen DynamicSuperb). It is end-to-end optimizable, with lower training/inference cost, and demonstrates LLM knowledge transferability to speech. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.19954v1-abstract-full').style.display = 'none'; document.getElementById('2406.19954v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 68T10 <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.7 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.19674">arXiv:2406.19674</a> <span> [<a href="https://arxiv.org/pdf/2406.19674">pdf</a>, <a href="https://arxiv.org/format/2406.19674">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Less is More: Accurate Speech Recognition & Translation without Web-Scale Data </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Puvvada%2C+K+C">Krishna C. Puvvada</a>, <a href="/search/eess?searchtype=author&query=%C5%BBelasko%2C+P">Piotr 呕elasko</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+H">He Huang</a>, <a href="/search/eess?searchtype=author&query=Hrinchuk%2C+O">Oleksii Hrinchuk</a>, <a href="/search/eess?searchtype=author&query=Koluguri%2C+N+R">Nithin Rao Koluguri</a>, <a href="/search/eess?searchtype=author&query=Dhawan%2C+K">Kunal Dhawan</a>, <a href="/search/eess?searchtype=author&query=Majumdar%2C+S">Somshubra Majumdar</a>, <a href="/search/eess?searchtype=author&query=Rastorgueva%2C+E">Elena Rastorgueva</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+Z">Zhehuai Chen</a>, <a href="/search/eess?searchtype=author&query=Lavrukhin%2C+V">Vitaly Lavrukhin</a>, <a href="/search/eess?searchtype=author&query=Balam%2C+J">Jagadeesh Balam</a>, <a href="/search/eess?searchtype=author&query=Ginsburg%2C+B">Boris Ginsburg</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.19674v1-abstract-short" style="display: inline;"> Recent advances in speech recognition and translation rely on hundreds of thousands of hours of Internet speech data. We argue that state-of-the art accuracy can be reached without relying on web-scale data. Canary - multilingual ASR and speech translation model, outperforms current state-of-the-art models - Whisper, OWSM, and Seamless-M4T on English, French, Spanish, and German languages, while b… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.19674v1-abstract-full').style.display = 'inline'; document.getElementById('2406.19674v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.19674v1-abstract-full" style="display: none;"> Recent advances in speech recognition and translation rely on hundreds of thousands of hours of Internet speech data. We argue that state-of-the art accuracy can be reached without relying on web-scale data. Canary - multilingual ASR and speech translation model, outperforms current state-of-the-art models - Whisper, OWSM, and Seamless-M4T on English, French, Spanish, and German languages, while being trained on an order of magnitude less data than these models. Three key factors enables such data-efficient model: (1) a FastConformer-based attention encoder-decoder architecture (2) training on synthetic data generated with machine translation and (3) advanced training techniques: data-balancing, dynamic data blending, dynamic bucketing and noise-robust fine-tuning. The model, weights, and training code will be open-sourced. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.19674v1-abstract-full').style.display = 'none'; document.getElementById('2406.19674v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at Interspeech-2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.18871">arXiv:2406.18871</a> <span> [<a href="https://arxiv.org/pdf/2406.18871">pdf</a>, <a href="https://arxiv.org/format/2406.18871">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> DeSTA: Enhancing Speech Language Models through Descriptive Speech-Text Alignment </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Lu%2C+K">Ke-Han Lu</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+Z">Zhehuai Chen</a>, <a href="/search/eess?searchtype=author&query=Fu%2C+S">Szu-Wei Fu</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+H">He Huang</a>, <a href="/search/eess?searchtype=author&query=Ginsburg%2C+B">Boris Ginsburg</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+Y+F">Yu-Chiang Frank Wang</a>, <a href="/search/eess?searchtype=author&query=Lee%2C+H">Hung-yi Lee</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.18871v1-abstract-short" style="display: inline;"> Recent speech language models (SLMs) typically incorporate pre-trained speech models to extend the capabilities from large language models (LLMs). In this paper, we propose a Descriptive Speech-Text Alignment approach that leverages speech captioning to bridge the gap between speech and text modalities, enabling SLMs to interpret and generate comprehensive natural language descriptions, thereby fa… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.18871v1-abstract-full').style.display = 'inline'; document.getElementById('2406.18871v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.18871v1-abstract-full" style="display: none;"> Recent speech language models (SLMs) typically incorporate pre-trained speech models to extend the capabilities from large language models (LLMs). In this paper, we propose a Descriptive Speech-Text Alignment approach that leverages speech captioning to bridge the gap between speech and text modalities, enabling SLMs to interpret and generate comprehensive natural language descriptions, thereby facilitating the capability to understand both linguistic and non-linguistic features in speech. Enhanced with the proposed approach, our model demonstrates superior performance on the Dynamic-SUPERB benchmark, particularly in generalizing to unseen tasks. Moreover, we discover that the aligned model exhibits a zero-shot instruction-following capability without explicit speech instruction tuning. These findings highlight the potential to reshape instruction-following SLMs by incorporating rich, descriptive speech captions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.18871v1-abstract-full').style.display = 'none'; document.getElementById('2406.18871v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to Interspeech 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.17957">arXiv:2406.17957</a> <span> [<a href="https://arxiv.org/pdf/2406.17957">pdf</a>, <a href="https://arxiv.org/format/2406.17957">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Improving Robustness of LLM-based Speech Synthesis by Learning Monotonic Alignment </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Neekhara%2C+P">Paarth Neekhara</a>, <a href="/search/eess?searchtype=author&query=Hussain%2C+S">Shehzeen Hussain</a>, <a href="/search/eess?searchtype=author&query=Ghosh%2C+S">Subhankar Ghosh</a>, <a href="/search/eess?searchtype=author&query=Li%2C+J">Jason Li</a>, <a href="/search/eess?searchtype=author&query=Valle%2C+R">Rafael Valle</a>, <a href="/search/eess?searchtype=author&query=Badlani%2C+R">Rohan Badlani</a>, <a href="/search/eess?searchtype=author&query=Ginsburg%2C+B">Boris Ginsburg</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.17957v1-abstract-short" style="display: inline;"> Large Language Model (LLM) based text-to-speech (TTS) systems have demonstrated remarkable capabilities in handling large speech datasets and generating natural speech for new speakers. However, LLM-based TTS models are not robust as the generated output can contain repeating words, missing words and mis-aligned speech (referred to as hallucinations or attention errors), especially when the text c… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.17957v1-abstract-full').style.display = 'inline'; document.getElementById('2406.17957v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.17957v1-abstract-full" style="display: none;"> Large Language Model (LLM) based text-to-speech (TTS) systems have demonstrated remarkable capabilities in handling large speech datasets and generating natural speech for new speakers. However, LLM-based TTS models are not robust as the generated output can contain repeating words, missing words and mis-aligned speech (referred to as hallucinations or attention errors), especially when the text contains multiple occurrences of the same token. We examine these challenges in an encoder-decoder transformer model and find that certain cross-attention heads in such models implicitly learn the text and speech alignment when trained for predicting speech tokens for a given text. To make the alignment more robust, we propose techniques utilizing CTC loss and attention priors that encourage monotonic cross-attention over the text tokens. Our guided attention training technique does not introduce any new learnable parameters and significantly improves robustness of LLM-based TTS models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.17957v1-abstract-full').style.display = 'none'; document.getElementById('2406.17957v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Published as a conference paper at INTERSPEECH 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.12946">arXiv:2406.12946</a> <span> [<a href="https://arxiv.org/pdf/2406.12946">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Instruction Data Generation and Unsupervised Adaptation for Speech Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Noroozi%2C+V">Vahid Noroozi</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+Z">Zhehuai Chen</a>, <a href="/search/eess?searchtype=author&query=Majumdar%2C+S">Somshubra Majumdar</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+S">Steve Huang</a>, <a href="/search/eess?searchtype=author&query=Balam%2C+J">Jagadeesh Balam</a>, <a href="/search/eess?searchtype=author&query=Ginsburg%2C+B">Boris Ginsburg</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.12946v1-abstract-short" style="display: inline;"> In this paper, we propose three methods for generating synthetic samples to train and evaluate multimodal large language models capable of processing both text and speech inputs. Addressing the scarcity of samples containing both modalities, synthetic data generation emerges as a crucial strategy to enhance the performance of such systems and facilitate the modeling of cross-modal relationships be… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.12946v1-abstract-full').style.display = 'inline'; document.getElementById('2406.12946v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.12946v1-abstract-full" style="display: none;"> In this paper, we propose three methods for generating synthetic samples to train and evaluate multimodal large language models capable of processing both text and speech inputs. Addressing the scarcity of samples containing both modalities, synthetic data generation emerges as a crucial strategy to enhance the performance of such systems and facilitate the modeling of cross-modal relationships between the speech and text domains. Our process employs large language models to generate textual components and text-to-speech systems to generate speech components. The proposed methods offer a practical and effective means to expand the training dataset for these models. Experimental results show progress in achieving an integrated understanding of text and speech. We also highlight the potential of using unlabeled speech data to generate synthetic samples comparable in quality to those with available transcriptions, enabling the expansion of these models to more languages. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.12946v1-abstract-full').style.display = 'none'; document.getElementById('2406.12946v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted for Interspeech 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.07096">arXiv:2406.07096</a> <span> [<a href="https://arxiv.org/pdf/2406.07096">pdf</a>, <a href="https://arxiv.org/format/2406.07096">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Fast Context-Biasing for CTC and Transducer ASR models with CTC-based Word Spotter </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Andrusenko%2C+A">Andrei Andrusenko</a>, <a href="/search/eess?searchtype=author&query=Laptev%2C+A">Aleksandr Laptev</a>, <a href="/search/eess?searchtype=author&query=Bataev%2C+V">Vladimir Bataev</a>, <a href="/search/eess?searchtype=author&query=Lavrukhin%2C+V">Vitaly Lavrukhin</a>, <a href="/search/eess?searchtype=author&query=Ginsburg%2C+B">Boris Ginsburg</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.07096v1-abstract-short" style="display: inline;"> Accurate recognition of rare and new words remains a pressing problem for contextualized Automatic Speech Recognition (ASR) systems. Most context-biasing methods involve modification of the ASR model or the beam-search decoding algorithm, complicating model reuse and slowing down inference. This work presents a new approach to fast context-biasing with CTC-based Word Spotter (CTC-WS) for CTC and T… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.07096v1-abstract-full').style.display = 'inline'; document.getElementById('2406.07096v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.07096v1-abstract-full" style="display: none;"> Accurate recognition of rare and new words remains a pressing problem for contextualized Automatic Speech Recognition (ASR) systems. Most context-biasing methods involve modification of the ASR model or the beam-search decoding algorithm, complicating model reuse and slowing down inference. This work presents a new approach to fast context-biasing with CTC-based Word Spotter (CTC-WS) for CTC and Transducer (RNN-T) ASR models. The proposed method matches CTC log-probabilities against a compact context graph to detect potential context-biasing candidates. The valid candidates then replace their greedy recognition counterparts in corresponding frame intervals. A Hybrid Transducer-CTC model enables the CTC-WS application for the Transducer model. The results demonstrate a significant acceleration of the context-biasing recognition with a simultaneous improvement in F-score and WER compared to baseline methods. The proposed method is publicly available in the NVIDIA NeMo toolkit. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.07096v1-abstract-full').style.display = 'none'; document.getElementById('2406.07096v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by Interspeech 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.06220">arXiv:2406.06220</a> <span> [<a href="https://arxiv.org/pdf/2406.06220">pdf</a>, <a href="https://arxiv.org/format/2406.06220">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/SLT61566.2024.10832333">10.1109/SLT61566.2024.10832333 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Label-Looping: Highly Efficient Decoding for Transducers </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Bataev%2C+V">Vladimir Bataev</a>, <a href="/search/eess?searchtype=author&query=Xu%2C+H">Hainan Xu</a>, <a href="/search/eess?searchtype=author&query=Galvez%2C+D">Daniel Galvez</a>, <a href="/search/eess?searchtype=author&query=Lavrukhin%2C+V">Vitaly Lavrukhin</a>, <a href="/search/eess?searchtype=author&query=Ginsburg%2C+B">Boris Ginsburg</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.06220v2-abstract-short" style="display: inline;"> This paper introduces a highly efficient greedy decoding algorithm for Transducer-based speech recognition models. We redesign the standard nested-loop design for RNN-T decoding, swapping loops over frames and labels: the outer loop iterates over labels, while the inner loop iterates over frames searching for the next non-blank symbol. Additionally, we represent partial hypotheses in a special str… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.06220v2-abstract-full').style.display = 'inline'; document.getElementById('2406.06220v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.06220v2-abstract-full" style="display: none;"> This paper introduces a highly efficient greedy decoding algorithm for Transducer-based speech recognition models. We redesign the standard nested-loop design for RNN-T decoding, swapping loops over frames and labels: the outer loop iterates over labels, while the inner loop iterates over frames searching for the next non-blank symbol. Additionally, we represent partial hypotheses in a special structure using CUDA tensors, supporting parallelized hypotheses manipulations. Experiments show that the label-looping algorithm is up to 2.0X faster than conventional batched decoding when using batch size 32. It can be further combined with other compiler or GPU call-related techniques to achieve even more speedup. Our algorithm is general-purpose and can work with both conventional Transducers and Token-and-Duration Transducers. We open-source our implementation to benefit the research community. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.06220v2-abstract-full').style.display = 'none'; document.getElementById('2406.06220v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 10 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at IEEE SLT 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.05298">arXiv:2406.05298</a> <span> [<a href="https://arxiv.org/pdf/2406.05298">pdf</a>, <a href="https://arxiv.org/format/2406.05298">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Spectral Codecs: Spectrogram-Based Audio Codecs for High Quality Speech Synthesis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Langman%2C+R">Ryan Langman</a>, <a href="/search/eess?searchtype=author&query=Juki%C4%87%2C+A">Ante Juki膰</a>, <a href="/search/eess?searchtype=author&query=Dhawan%2C+K">Kunal Dhawan</a>, <a href="/search/eess?searchtype=author&query=Koluguri%2C+N+R">Nithin Rao Koluguri</a>, <a href="/search/eess?searchtype=author&query=Ginsburg%2C+B">Boris Ginsburg</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.05298v1-abstract-short" style="display: inline;"> Historically, most speech models in machine-learning have used the mel-spectrogram as a speech representation. Recently, discrete audio tokens produced by neural audio codecs have become a popular alternate speech representation for speech synthesis tasks such as text-to-speech (TTS). However, the data distribution produced by such codecs is too complex for some TTS models to predict, hence requir… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.05298v1-abstract-full').style.display = 'inline'; document.getElementById('2406.05298v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.05298v1-abstract-full" style="display: none;"> Historically, most speech models in machine-learning have used the mel-spectrogram as a speech representation. Recently, discrete audio tokens produced by neural audio codecs have become a popular alternate speech representation for speech synthesis tasks such as text-to-speech (TTS). However, the data distribution produced by such codecs is too complex for some TTS models to predict, hence requiring large autoregressive models to get reasonable quality. Typical audio codecs compress and reconstruct the time-domain audio signal. We propose a spectral codec which compresses the mel-spectrogram and reconstructs the time-domain audio signal. A study of objective audio quality metrics suggests that our spectral codec has comparable perceptual quality to equivalent audio codecs. Furthermore, non-autoregressive TTS models trained with the proposed spectral codec generate audio with significantly higher quality than when trained with mel-spectrograms or audio codecs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.05298v1-abstract-full').style.display = 'none'; document.getElementById('2406.05298v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.04552">arXiv:2406.04552</a> <span> [<a href="https://arxiv.org/pdf/2406.04552">pdf</a>, <a href="https://arxiv.org/ps/2406.04552">ps</a>, <a href="https://arxiv.org/format/2406.04552">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Flexible Multichannel Speech Enhancement for Noise-Robust Frontend </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Juki%C4%87%2C+A">Ante Juki膰</a>, <a href="/search/eess?searchtype=author&query=Balam%2C+J">Jagadeesh Balam</a>, <a href="/search/eess?searchtype=author&query=Ginsburg%2C+B">Boris Ginsburg</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.04552v1-abstract-short" style="display: inline;"> This paper proposes a flexible multichannel speech enhancement system with the main goal of improving robustness of automatic speech recognition (ASR) in noisy conditions. The proposed system combines a flexible neural mask estimator applicable to different channel counts and configurations and a multichannel filter with automatic reference selection. A transform-attend-concatenate layer is propos… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.04552v1-abstract-full').style.display = 'inline'; document.getElementById('2406.04552v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.04552v1-abstract-full" style="display: none;"> This paper proposes a flexible multichannel speech enhancement system with the main goal of improving robustness of automatic speech recognition (ASR) in noisy conditions. The proposed system combines a flexible neural mask estimator applicable to different channel counts and configurations and a multichannel filter with automatic reference selection. A transform-attend-concatenate layer is proposed to handle cross-channel information in the mask estimator, which is shown to be effective for arbitrary microphone configurations. The presented evaluation demonstrates the effectiveness of the flexible system for several seen and unseen compact array geometries, matching the performance of fixed configuration-specific systems. Furthermore, a significantly improved ASR performance is observed for configurations with randomly-placed microphones. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.04552v1-abstract-full').style.display = 'none'; document.getElementById('2406.04552v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> WASPAA 2023 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.12983">arXiv:2405.12983</a> <span> [<a href="https://arxiv.org/pdf/2405.12983">pdf</a>, <a href="https://arxiv.org/format/2405.12983">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Multilingual Audio-Visual Speech Recognition with Hybrid CTC/RNN-T Fast Conformer </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Burchi%2C+M">Maxime Burchi</a>, <a href="/search/eess?searchtype=author&query=Puvvada%2C+K+C">Krishna C. Puvvada</a>, <a href="/search/eess?searchtype=author&query=Balam%2C+J">Jagadeesh Balam</a>, <a href="/search/eess?searchtype=author&query=Ginsburg%2C+B">Boris Ginsburg</a>, <a href="/search/eess?searchtype=author&query=Timofte%2C+R">Radu Timofte</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.12983v1-abstract-short" style="display: inline;"> Humans are adept at leveraging visual cues from lip movements for recognizing speech in adverse listening conditions. Audio-Visual Speech Recognition (AVSR) models follow similar approach to achieve robust speech recognition in noisy conditions. In this work, we present a multilingual AVSR model incorporating several enhancements to improve performance and audio noise robustness. Notably, we adapt… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.12983v1-abstract-full').style.display = 'inline'; document.getElementById('2405.12983v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.12983v1-abstract-full" style="display: none;"> Humans are adept at leveraging visual cues from lip movements for recognizing speech in adverse listening conditions. Audio-Visual Speech Recognition (AVSR) models follow similar approach to achieve robust speech recognition in noisy conditions. In this work, we present a multilingual AVSR model incorporating several enhancements to improve performance and audio noise robustness. Notably, we adapt the recently proposed Fast Conformer model to process both audio and visual modalities using a novel hybrid CTC/RNN-T architecture. We increase the amount of audio-visual training data for six distinct languages, generating automatic transcriptions of unlabelled multilingual datasets (VoxCeleb2 and AVSpeech). Our proposed model achieves new state-of-the-art performance on the LRS3 dataset, reaching WER of 0.8%. On the recently introduced MuAViC benchmark, our model yields an absolute average-WER reduction of 11.9% in comparison to the original baseline. Finally, we demonstrate the ability of the proposed model to perform audio-only, visual-only, and audio-visual speech recognition at test time. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.12983v1-abstract-full').style.display = 'none'; document.getElementById('2405.12983v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.04295">arXiv:2404.04295</a> <span> [<a href="https://arxiv.org/pdf/2404.04295">pdf</a>, <a href="https://arxiv.org/format/2404.04295">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Transducers with Pronunciation-aware Embeddings for Automatic Speech Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Xu%2C+H">Hainan Xu</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+Z">Zhehuai Chen</a>, <a href="/search/eess?searchtype=author&query=Jia%2C+F">Fei Jia</a>, <a href="/search/eess?searchtype=author&query=Ginsburg%2C+B">Boris Ginsburg</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.04295v1-abstract-short" style="display: inline;"> This paper proposes Transducers with Pronunciation-aware Embeddings (PET). Unlike conventional Transducers where the decoder embeddings for different tokens are trained independently, the PET model's decoder embedding incorporates shared components for text tokens with the same or similar pronunciations. With experiments conducted in multiple datasets in Mandarin Chinese and Korean, we show that P… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.04295v1-abstract-full').style.display = 'inline'; document.getElementById('2404.04295v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.04295v1-abstract-full" style="display: none;"> This paper proposes Transducers with Pronunciation-aware Embeddings (PET). Unlike conventional Transducers where the decoder embeddings for different tokens are trained independently, the PET model's decoder embedding incorporates shared components for text tokens with the same or similar pronunciations. With experiments conducted in multiple datasets in Mandarin Chinese and Korean, we show that PET models consistently improve speech recognition accuracy compared to conventional Transducers. Our investigation also uncovers a phenomenon that we call error chain reactions. Instead of recognition errors being evenly spread throughout an utterance, they tend to group together, with subsequent errors often following earlier ones. Our analysis shows that PET models effectively mitigate this issue by substantially reducing the likelihood of the model generating additional errors following a prior one. Our implementation will be open-sourced with the NeMo toolkit. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.04295v1-abstract-full').style.display = 'none'; document.getElementById('2404.04295v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">accepted at the ICASSP 2024 conference</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2312.17279">arXiv:2312.17279</a> <span> [<a href="https://arxiv.org/pdf/2312.17279">pdf</a>, <a href="https://arxiv.org/format/2312.17279">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Stateful Conformer with Cache-based Inference for Streaming Automatic Speech Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Noroozi%2C+V">Vahid Noroozi</a>, <a href="/search/eess?searchtype=author&query=Majumdar%2C+S">Somshubra Majumdar</a>, <a href="/search/eess?searchtype=author&query=Kumar%2C+A">Ankur Kumar</a>, <a href="/search/eess?searchtype=author&query=Balam%2C+J">Jagadeesh Balam</a>, <a href="/search/eess?searchtype=author&query=Ginsburg%2C+B">Boris Ginsburg</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2312.17279v3-abstract-short" style="display: inline;"> In this paper, we propose an efficient and accurate streaming speech recognition model based on the FastConformer architecture. We adapted the FastConformer architecture for streaming applications through: (1) constraining both the look-ahead and past contexts in the encoder, and (2) introducing an activation caching mechanism to enable the non-autoregressive encoder to operate autoregressively du… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.17279v3-abstract-full').style.display = 'inline'; document.getElementById('2312.17279v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2312.17279v3-abstract-full" style="display: none;"> In this paper, we propose an efficient and accurate streaming speech recognition model based on the FastConformer architecture. We adapted the FastConformer architecture for streaming applications through: (1) constraining both the look-ahead and past contexts in the encoder, and (2) introducing an activation caching mechanism to enable the non-autoregressive encoder to operate autoregressively during inference. The proposed model is thoughtfully designed in a way to eliminate the accuracy disparity between the train and inference time which is common for many streaming models. Furthermore, our proposed encoder works with various decoder configurations including Connectionist Temporal Classification (CTC) and RNN-Transducer (RNNT) decoders. Additionally, we introduced a hybrid CTC/RNNT architecture which utilizes a shared encoder with both a CTC and RNNT decoder to boost the accuracy and save computation. We evaluate the proposed model on LibriSpeech dataset and a multi-domain large scale dataset and demonstrate that it can achieve better accuracy with lower latency and inference time compared to a conventional buffered streaming model baseline. We also showed that training a model with multiple latencies can achieve better accuracy than single latency models while it enables us to support multiple latencies with a single model. Our experiments also showed the hybrid architecture would not only speedup the convergence of the CTC decoder but also improves the accuracy of streaming models compared to single decoder models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.17279v3-abstract-full').style.display = 'none'; document.getElementById('2312.17279v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 27 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Shorter version accepted to ICASSP 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2310.12378">arXiv:2310.12378</a> <span> [<a href="https://arxiv.org/pdf/2310.12378">pdf</a>, <a href="https://arxiv.org/format/2310.12378">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> The CHiME-7 Challenge: System Description and Performance of NeMo Team's DASR System </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Park%2C+T+J">Tae Jin Park</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+H">He Huang</a>, <a href="/search/eess?searchtype=author&query=Jukic%2C+A">Ante Jukic</a>, <a href="/search/eess?searchtype=author&query=Dhawan%2C+K">Kunal Dhawan</a>, <a href="/search/eess?searchtype=author&query=Puvvada%2C+K+C">Krishna C. Puvvada</a>, <a href="/search/eess?searchtype=author&query=Koluguri%2C+N">Nithin Koluguri</a>, <a href="/search/eess?searchtype=author&query=Karpov%2C+N">Nikolay Karpov</a>, <a href="/search/eess?searchtype=author&query=Laptev%2C+A">Aleksandr Laptev</a>, <a href="/search/eess?searchtype=author&query=Balam%2C+J">Jagadeesh Balam</a>, <a href="/search/eess?searchtype=author&query=Ginsburg%2C+B">Boris Ginsburg</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2310.12378v1-abstract-short" style="display: inline;"> We present the NVIDIA NeMo team's multi-channel speech recognition system for the 7th CHiME Challenge Distant Automatic Speech Recognition (DASR) Task, focusing on the development of a multi-channel, multi-speaker speech recognition system tailored to transcribe speech from distributed microphones and microphone arrays. The system predominantly comprises of the following integral modules: the Spea… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.12378v1-abstract-full').style.display = 'inline'; document.getElementById('2310.12378v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2310.12378v1-abstract-full" style="display: none;"> We present the NVIDIA NeMo team's multi-channel speech recognition system for the 7th CHiME Challenge Distant Automatic Speech Recognition (DASR) Task, focusing on the development of a multi-channel, multi-speaker speech recognition system tailored to transcribe speech from distributed microphones and microphone arrays. The system predominantly comprises of the following integral modules: the Speaker Diarization Module, Multi-channel Audio Front-End Processing Module, and the ASR Module. These components collectively establish a cascading system, meticulously processing multi-channel and multi-speaker audio input. Moreover, this paper highlights the comprehensive optimization process that significantly enhanced our system's performance. Our team's submission is largely based on NeMo toolkits and will be publicly available. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.12378v1-abstract-full').style.display = 'none'; document.getElementById('2310.12378v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> CHiME-7 Workshop 2023 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2310.12371">arXiv:2310.12371</a> <span> [<a href="https://arxiv.org/pdf/2310.12371">pdf</a>, <a href="https://arxiv.org/format/2310.12371">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Property-Aware Multi-Speaker Data Simulation: A Probabilistic Modelling Technique for Synthetic Data Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Park%2C+T+J">Tae Jin Park</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+H">He Huang</a>, <a href="/search/eess?searchtype=author&query=Hooper%2C+C">Coleman Hooper</a>, <a href="/search/eess?searchtype=author&query=Koluguri%2C+N">Nithin Koluguri</a>, <a href="/search/eess?searchtype=author&query=Dhawan%2C+K">Kunal Dhawan</a>, <a href="/search/eess?searchtype=author&query=Jukic%2C+A">Ante Jukic</a>, <a href="/search/eess?searchtype=author&query=Balam%2C+J">Jagadeesh Balam</a>, <a href="/search/eess?searchtype=author&query=Ginsburg%2C+B">Boris Ginsburg</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2310.12371v1-abstract-short" style="display: inline;"> We introduce a sophisticated multi-speaker speech data simulator, specifically engineered to generate multi-speaker speech recordings. A notable feature of this simulator is its capacity to modulate the distribution of silence and overlap via the adjustment of statistical parameters. This capability offers a tailored training environment for developing neural models suited for speaker diarization… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.12371v1-abstract-full').style.display = 'inline'; document.getElementById('2310.12371v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2310.12371v1-abstract-full" style="display: none;"> We introduce a sophisticated multi-speaker speech data simulator, specifically engineered to generate multi-speaker speech recordings. A notable feature of this simulator is its capacity to modulate the distribution of silence and overlap via the adjustment of statistical parameters. This capability offers a tailored training environment for developing neural models suited for speaker diarization and voice activity detection. The acquisition of substantial datasets for speaker diarization often presents a significant challenge, particularly in multi-speaker scenarios. Furthermore, the precise time stamp annotation of speech data is a critical factor for training both speaker diarization and voice activity detection. Our proposed multi-speaker simulator tackles these problems by generating large-scale audio mixtures that maintain statistical properties closely aligned with the input parameters. We demonstrate that the proposed multi-speaker simulator generates audio mixtures with statistical properties that closely align with the input parameters derived from real-world statistics. Additionally, we present the effectiveness of speaker diarization and voice activity detection models, which have been trained exclusively on the generated simulated datasets. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.12371v1-abstract-full').style.display = 'none'; document.getElementById('2310.12371v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> CHiME-7 Workshop 2023 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2310.09653">arXiv:2310.09653</a> <span> [<a href="https://arxiv.org/pdf/2310.09653">pdf</a>, <a href="https://arxiv.org/format/2310.09653">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> SelfVC: Voice Conversion With Iterative Refinement using Self Transformations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Neekhara%2C+P">Paarth Neekhara</a>, <a href="/search/eess?searchtype=author&query=Hussain%2C+S">Shehzeen Hussain</a>, <a href="/search/eess?searchtype=author&query=Valle%2C+R">Rafael Valle</a>, <a href="/search/eess?searchtype=author&query=Ginsburg%2C+B">Boris Ginsburg</a>, <a href="/search/eess?searchtype=author&query=Ranjan%2C+R">Rishabh Ranjan</a>, <a href="/search/eess?searchtype=author&query=Dubnov%2C+S">Shlomo Dubnov</a>, <a href="/search/eess?searchtype=author&query=Koushanfar%2C+F">Farinaz Koushanfar</a>, <a href="/search/eess?searchtype=author&query=McAuley%2C+J">Julian McAuley</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2310.09653v2-abstract-short" style="display: inline;"> We propose SelfVC, a training strategy to iteratively improve a voice conversion model with self-synthesized examples. Previous efforts on voice conversion focus on factorizing speech into explicitly disentangled representations that separately encode speaker characteristics and linguistic content. However, disentangling speech representations to capture such attributes using task-specific loss te… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.09653v2-abstract-full').style.display = 'inline'; document.getElementById('2310.09653v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2310.09653v2-abstract-full" style="display: none;"> We propose SelfVC, a training strategy to iteratively improve a voice conversion model with self-synthesized examples. Previous efforts on voice conversion focus on factorizing speech into explicitly disentangled representations that separately encode speaker characteristics and linguistic content. However, disentangling speech representations to capture such attributes using task-specific loss terms can lead to information loss. In this work, instead of explicitly disentangling attributes with loss terms, we present a framework to train a controllable voice conversion model on entangled speech representations derived from self-supervised learning (SSL) and speaker verification models. First, we develop techniques to derive prosodic information from the audio signal and SSL representations to train predictive submodules in the synthesis model. Next, we propose a training strategy to iteratively improve the synthesis model for voice conversion, by creating a challenging training objective using self-synthesized examples. We demonstrate that incorporating such self-synthesized examples during training improves the speaker similarity of generated speech as compared to a baseline voice conversion model trained solely on heuristically perturbed inputs. Our framework is trained without any text and achieves state-of-the-art results in zero-shot voice conversion on metrics evaluating naturalness, speaker similarity, and intelligibility of synthesized audio. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.09653v2-abstract-full').style.display = 'none'; document.getElementById('2310.09653v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 14 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at ICML 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2310.09424">arXiv:2310.09424</a> <span> [<a href="https://arxiv.org/pdf/2310.09424">pdf</a>, <a href="https://arxiv.org/format/2310.09424">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> SALM: Speech-augmented Language Model with In-context Learning for Speech Recognition and Translation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Chen%2C+Z">Zhehuai Chen</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+H">He Huang</a>, <a href="/search/eess?searchtype=author&query=Andrusenko%2C+A">Andrei Andrusenko</a>, <a href="/search/eess?searchtype=author&query=Hrinchuk%2C+O">Oleksii Hrinchuk</a>, <a href="/search/eess?searchtype=author&query=Puvvada%2C+K+C">Krishna C. Puvvada</a>, <a href="/search/eess?searchtype=author&query=Li%2C+J">Jason Li</a>, <a href="/search/eess?searchtype=author&query=Ghosh%2C+S">Subhankar Ghosh</a>, <a href="/search/eess?searchtype=author&query=Balam%2C+J">Jagadeesh Balam</a>, <a href="/search/eess?searchtype=author&query=Ginsburg%2C+B">Boris Ginsburg</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2310.09424v1-abstract-short" style="display: inline;"> We present a novel Speech Augmented Language Model (SALM) with {\em multitask} and {\em in-context} learning capabilities. SALM comprises a frozen text LLM, a audio encoder, a modality adapter module, and LoRA layers to accommodate speech input and associated task instructions. The unified SALM not only achieves performance on par with task-specific Conformer baselines for Automatic Speech Recogni… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.09424v1-abstract-full').style.display = 'inline'; document.getElementById('2310.09424v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2310.09424v1-abstract-full" style="display: none;"> We present a novel Speech Augmented Language Model (SALM) with {\em multitask} and {\em in-context} learning capabilities. SALM comprises a frozen text LLM, a audio encoder, a modality adapter module, and LoRA layers to accommodate speech input and associated task instructions. The unified SALM not only achieves performance on par with task-specific Conformer baselines for Automatic Speech Recognition (ASR) and Speech Translation (AST), but also exhibits zero-shot in-context learning capabilities, demonstrated through keyword-boosting task for ASR and AST. Moreover, {\em speech supervised in-context training} is proposed to bridge the gap between LLM training and downstream speech tasks, which further boosts the in-context learning ability of speech-to-text models. Proposed model is open-sourced via NeMo toolkit. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.09424v1-abstract-full').style.display = 'none'; document.getElementById('2310.09424v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">submit to ICASSP 2024</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 68T10 <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.7 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2309.10922">arXiv:2309.10922</a> <span> [<a href="https://arxiv.org/pdf/2309.10922">pdf</a>, <a href="https://arxiv.org/format/2309.10922">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Discrete Audio Representation as an Alternative to Mel-Spectrograms for Speaker and Speech Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Puvvada%2C+K+C">Krishna C. Puvvada</a>, <a href="/search/eess?searchtype=author&query=Koluguri%2C+N+R">Nithin Rao Koluguri</a>, <a href="/search/eess?searchtype=author&query=Dhawan%2C+K">Kunal Dhawan</a>, <a href="/search/eess?searchtype=author&query=Balam%2C+J">Jagadeesh Balam</a>, <a href="/search/eess?searchtype=author&query=Ginsburg%2C+B">Boris Ginsburg</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2309.10922v1-abstract-short" style="display: inline;"> Discrete audio representation, aka audio tokenization, has seen renewed interest driven by its potential to facilitate the application of text language modeling approaches in audio domain. To this end, various compression and representation-learning based tokenization schemes have been proposed. However, there is limited investigation into the performance of compression-based audio tokens compared… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.10922v1-abstract-full').style.display = 'inline'; document.getElementById('2309.10922v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2309.10922v1-abstract-full" style="display: none;"> Discrete audio representation, aka audio tokenization, has seen renewed interest driven by its potential to facilitate the application of text language modeling approaches in audio domain. To this end, various compression and representation-learning based tokenization schemes have been proposed. However, there is limited investigation into the performance of compression-based audio tokens compared to well-established mel-spectrogram features across various speaker and speech related tasks. In this paper, we evaluate compression based audio tokens on three tasks: Speaker Verification, Diarization and (Multi-lingual) Speech Recognition. Our findings indicate that (i) the models trained on audio tokens perform competitively, on average within $1\%$ of mel-spectrogram features for all the tasks considered, and do not surpass them yet. (ii) these models exhibit robustness for out-of-domain narrowband data, particularly in speaker tasks. (iii) audio tokens allow for compression to 20x compared to mel-spectrogram features with minimal loss of performance in speech and speaker related tasks, which is crucial for low bit-rate applications, and (iv) the examined Residual Vector Quantization (RVQ) based audio tokenizer exhibits a low-pass frequency response characteristic, offering a plausible explanation for the observed results, and providing insight for future tokenizer designs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.10922v1-abstract-full').style.display = 'none'; document.getElementById('2309.10922v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 September, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Preprint. Submitted to ICASSP 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2309.09950">arXiv:2309.09950</a> <span> [<a href="https://arxiv.org/pdf/2309.09950">pdf</a>, <a href="https://arxiv.org/format/2309.09950">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Investigating End-to-End ASR Architectures for Long Form Audio Transcription </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Koluguri%2C+N+R">Nithin Rao Koluguri</a>, <a href="/search/eess?searchtype=author&query=Kriman%2C+S">Samuel Kriman</a>, <a href="/search/eess?searchtype=author&query=Zelenfroind%2C+G">Georgy Zelenfroind</a>, <a href="/search/eess?searchtype=author&query=Majumdar%2C+S">Somshubra Majumdar</a>, <a href="/search/eess?searchtype=author&query=Rekesh%2C+D">Dima Rekesh</a>, <a href="/search/eess?searchtype=author&query=Noroozi%2C+V">Vahid Noroozi</a>, <a href="/search/eess?searchtype=author&query=Balam%2C+J">Jagadeesh Balam</a>, <a href="/search/eess?searchtype=author&query=Ginsburg%2C+B">Boris Ginsburg</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2309.09950v2-abstract-short" style="display: inline;"> This paper presents an overview and evaluation of some of the end-to-end ASR models on long-form audios. We study three categories of Automatic Speech Recognition(ASR) models based on their core architecture: (1) convolutional, (2) convolutional with squeeze-and-excitation and (3) convolutional models with attention. We selected one ASR model from each category and evaluated Word Error Rate, maxim… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.09950v2-abstract-full').style.display = 'inline'; document.getElementById('2309.09950v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2309.09950v2-abstract-full" style="display: none;"> This paper presents an overview and evaluation of some of the end-to-end ASR models on long-form audios. We study three categories of Automatic Speech Recognition(ASR) models based on their core architecture: (1) convolutional, (2) convolutional with squeeze-and-excitation and (3) convolutional models with attention. We selected one ASR model from each category and evaluated Word Error Rate, maximum audio length and real-time factor for each model on a variety of long audio benchmarks: Earnings-21 and 22, CORAAL, and TED-LIUM3. The model from the category of self-attention with local attention and global token has the best accuracy comparing to other architectures. We also compared models with CTC and RNNT decoders and showed that CTC-based models are more robust and efficient than RNNT on long form audio. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.09950v2-abstract-full').style.display = 'none'; document.getElementById('2309.09950v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 September, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 September, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">PrePrint. Submitted to ICASSP 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2308.05218">arXiv:2308.05218</a> <span> [<a href="https://arxiv.org/pdf/2308.05218">pdf</a>, <a href="https://arxiv.org/format/2308.05218">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/ICASSP49357.2023.10095115">10.1109/ICASSP49357.2023.10095115 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Conformer-based Target-Speaker Automatic Speech Recognition for Single-Channel Audio </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhang%2C+Y">Yang Zhang</a>, <a href="/search/eess?searchtype=author&query=Puvvada%2C+K+C">Krishna C. Puvvada</a>, <a href="/search/eess?searchtype=author&query=Lavrukhin%2C+V">Vitaly Lavrukhin</a>, <a href="/search/eess?searchtype=author&query=Ginsburg%2C+B">Boris Ginsburg</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2308.05218v1-abstract-short" style="display: inline;"> We propose CONF-TSASR, a non-autoregressive end-to-end time-frequency domain architecture for single-channel target-speaker automatic speech recognition (TS-ASR). The model consists of a TitaNet based speaker embedding module, a Conformer based masking as well as ASR modules. These modules are jointly optimized to transcribe a target-speaker, while ignoring speech from other speakers. For training… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.05218v1-abstract-full').style.display = 'inline'; document.getElementById('2308.05218v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2308.05218v1-abstract-full" style="display: none;"> We propose CONF-TSASR, a non-autoregressive end-to-end time-frequency domain architecture for single-channel target-speaker automatic speech recognition (TS-ASR). The model consists of a TitaNet based speaker embedding module, a Conformer based masking as well as ASR modules. These modules are jointly optimized to transcribe a target-speaker, while ignoring speech from other speakers. For training we use Connectionist Temporal Classification (CTC) loss and introduce a scale-invariant spectrogram reconstruction loss to encourage the model better separate the target-speaker's spectrogram from mixture. We obtain state-of-the-art target-speaker word error rate (TS-WER) on WSJ0-2mix-extr (4.2%). Further, we report for the first time TS-WER on WSJ0-3mix-extr (12.4%), LibriSpeech2Mix (4.2%) and LibriSpeech3Mix (7.6%) datasets, establishing new benchmarks for TS-ASR. The proposed model will be open-sourced through NVIDIA NeMo toolkit. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.05218v1-abstract-full').style.display = 'none'; document.getElementById('2308.05218v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 August, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2307.07057">arXiv:2307.07057</a> <span> [<a href="https://arxiv.org/pdf/2307.07057">pdf</a>, <a href="https://arxiv.org/format/2307.07057">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Leveraging Pretrained ASR Encoders for Effective and Efficient End-to-End Speech Intent Classification and Slot Filling </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Huang%2C+H">He Huang</a>, <a href="/search/eess?searchtype=author&query=Balam%2C+J">Jagadeesh Balam</a>, <a href="/search/eess?searchtype=author&query=Ginsburg%2C+B">Boris Ginsburg</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2307.07057v1-abstract-short" style="display: inline;"> We study speech intent classification and slot filling (SICSF) by proposing to use an encoder pretrained on speech recognition (ASR) to initialize an end-to-end (E2E) Conformer-Transformer model, which achieves the new state-of-the-art results on the SLURP dataset, with 90.14% intent accuracy and 82.27% SLURP-F1. We compare our model with encoders pretrained on self-supervised learning (SSL), and… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2307.07057v1-abstract-full').style.display = 'inline'; document.getElementById('2307.07057v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2307.07057v1-abstract-full" style="display: none;"> We study speech intent classification and slot filling (SICSF) by proposing to use an encoder pretrained on speech recognition (ASR) to initialize an end-to-end (E2E) Conformer-Transformer model, which achieves the new state-of-the-art results on the SLURP dataset, with 90.14% intent accuracy and 82.27% SLURP-F1. We compare our model with encoders pretrained on self-supervised learning (SSL), and show that ASR pretraining is much more effective than SSL for SICSF. To explore parameter efficiency, we freeze the encoder and add Adapter modules, and show that parameter efficiency is only achievable with an ASR-pretrained encoder, while the SSL encoder needs full finetuning to achieve comparable results. In addition, we provide an in-depth comparison on end-to-end models versus cascading models (ASR+NLU), and show that E2E models are better than cascaded models unless an oracle ASR model is provided. Last but not least, our model is the first E2E model that achieves the same performance as cascading models with oracle ASR. Code, checkpoints and configs are available. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2307.07057v1-abstract-full').style.display = 'none'; document.getElementById('2307.07057v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 July, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">INTERSPEECH 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2306.15824">arXiv:2306.15824</a> <span> [<a href="https://arxiv.org/pdf/2306.15824">pdf</a>, <a href="https://arxiv.org/format/2306.15824">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.21437/Interspeech.2023-1281">10.21437/Interspeech.2023-1281 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Confidence-based Ensembles of End-to-End Speech Recognition Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Gitman%2C+I">Igor Gitman</a>, <a href="/search/eess?searchtype=author&query=Lavrukhin%2C+V">Vitaly Lavrukhin</a>, <a href="/search/eess?searchtype=author&query=Laptev%2C+A">Aleksandr Laptev</a>, <a href="/search/eess?searchtype=author&query=Ginsburg%2C+B">Boris Ginsburg</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2306.15824v1-abstract-short" style="display: inline;"> The number of end-to-end speech recognition models grows every year. These models are often adapted to new domains or languages resulting in a proliferation of expert systems that achieve great results on target data, while generally showing inferior performance outside of their domain of expertise. We explore combination of such experts via confidence-based ensembles: ensembles of models where on… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.15824v1-abstract-full').style.display = 'inline'; document.getElementById('2306.15824v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2306.15824v1-abstract-full" style="display: none;"> The number of end-to-end speech recognition models grows every year. These models are often adapted to new domains or languages resulting in a proliferation of expert systems that achieve great results on target data, while generally showing inferior performance outside of their domain of expertise. We explore combination of such experts via confidence-based ensembles: ensembles of models where only the output of the most-confident model is used. We assume that models' target data is not available except for a small validation set. We demonstrate effectiveness of our approach with two applications. First, we show that a confidence-based ensemble of 5 monolingual models outperforms a system where model selection is performed via a dedicated language identification block. Second, we demonstrate that it is possible to combine base and adapted models to achieve strong results on both original and target data. We validate all our results on multiple datasets and model architectures. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.15824v1-abstract-full').style.display = 'none'; document.getElementById('2306.15824v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">To appear in Proc. INTERSPEECH 2023, August 20-24, 2023, Dublin, Ireland</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2306.08753">arXiv:2306.08753</a> <span> [<a href="https://arxiv.org/pdf/2306.08753">pdf</a>, <a href="https://arxiv.org/format/2306.08753">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Unified model for code-switching speech recognition and language identification based on a concatenated tokenizer </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Dhawan%2C+K">Kunal Dhawan</a>, <a href="/search/eess?searchtype=author&query=Rekesh%2C+D">Dima Rekesh</a>, <a href="/search/eess?searchtype=author&query=Ginsburg%2C+B">Boris Ginsburg</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2306.08753v3-abstract-short" style="display: inline;"> Code-Switching (CS) multilingual Automatic Speech Recognition (ASR) models can transcribe speech containing two or more alternating languages during a conversation. This paper proposes (1) a new method for creating code-switching ASR datasets from purely monolingual data sources, and (2) a novel Concatenated Tokenizer that enables ASR models to generate language ID for each emitted text token whil… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.08753v3-abstract-full').style.display = 'inline'; document.getElementById('2306.08753v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2306.08753v3-abstract-full" style="display: none;"> Code-Switching (CS) multilingual Automatic Speech Recognition (ASR) models can transcribe speech containing two or more alternating languages during a conversation. This paper proposes (1) a new method for creating code-switching ASR datasets from purely monolingual data sources, and (2) a novel Concatenated Tokenizer that enables ASR models to generate language ID for each emitted text token while reusing existing monolingual tokenizers. The efficacy of these approaches for building CS ASR models is demonstrated for two language pairs, English-Hindi and English-Spanish, where we achieve new state-of-the-art results on the Miami Bangor CS evaluation corpus. In addition to competitive ASR performance, the proposed Concatenated Tokenizer models are highly effective for spoken language identification, achieving 98%+ accuracy on the out-of-distribution FLEURS dataset. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.08753v3-abstract-full').style.display = 'none'; document.getElementById('2306.08753v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 September, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 14 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2306.02317">arXiv:2306.02317</a> <span> [<a href="https://arxiv.org/pdf/2306.02317">pdf</a>, <a href="https://arxiv.org/format/2306.02317">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> SpellMapper: A non-autoregressive neural spellchecker for ASR customization with candidate retrieval based on n-gram mappings </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Antonova%2C+A">Alexandra Antonova</a>, <a href="/search/eess?searchtype=author&query=Bakhturina%2C+E">Evelina Bakhturina</a>, <a href="/search/eess?searchtype=author&query=Ginsburg%2C+B">Boris Ginsburg</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2306.02317v1-abstract-short" style="display: inline;"> Contextual spelling correction models are an alternative to shallow fusion to improve automatic speech recognition (ASR) quality given user vocabulary. To deal with large user vocabularies, most of these models include candidate retrieval mechanisms, usually based on minimum edit distance between fragments of ASR hypothesis and user phrases. However, the edit-distance approach is slow, non-trainab… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.02317v1-abstract-full').style.display = 'inline'; document.getElementById('2306.02317v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2306.02317v1-abstract-full" style="display: none;"> Contextual spelling correction models are an alternative to shallow fusion to improve automatic speech recognition (ASR) quality given user vocabulary. To deal with large user vocabularies, most of these models include candidate retrieval mechanisms, usually based on minimum edit distance between fragments of ASR hypothesis and user phrases. However, the edit-distance approach is slow, non-trainable, and may have low recall as it relies only on common letters. We propose: 1) a novel algorithm for candidate retrieval, based on misspelled n-gram mappings, which gives up to 90% recall with just the top 10 candidates on Spoken Wikipedia; 2) a non-autoregressive neural model based on BERT architecture, where the initial transcript and ten candidates are combined into one input. The experiments on Spoken Wikipedia show 21.4% word error rate improvement compared to a baseline ASR system. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.02317v1-abstract-full').style.display = 'none'; document.getElementById('2306.02317v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by INTERSPEECH 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2305.05084">arXiv:2305.05084</a> <span> [<a href="https://arxiv.org/pdf/2305.05084">pdf</a>, <a href="https://arxiv.org/format/2305.05084">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Fast Conformer with Linearly Scalable Attention for Efficient Speech Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Rekesh%2C+D">Dima Rekesh</a>, <a href="/search/eess?searchtype=author&query=Koluguri%2C+N+R">Nithin Rao Koluguri</a>, <a href="/search/eess?searchtype=author&query=Kriman%2C+S">Samuel Kriman</a>, <a href="/search/eess?searchtype=author&query=Majumdar%2C+S">Somshubra Majumdar</a>, <a href="/search/eess?searchtype=author&query=Noroozi%2C+V">Vahid Noroozi</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+H">He Huang</a>, <a href="/search/eess?searchtype=author&query=Hrinchuk%2C+O">Oleksii Hrinchuk</a>, <a href="/search/eess?searchtype=author&query=Puvvada%2C+K">Krishna Puvvada</a>, <a href="/search/eess?searchtype=author&query=Kumar%2C+A">Ankur Kumar</a>, <a href="/search/eess?searchtype=author&query=Balam%2C+J">Jagadeesh Balam</a>, <a href="/search/eess?searchtype=author&query=Ginsburg%2C+B">Boris Ginsburg</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2305.05084v6-abstract-short" style="display: inline;"> Conformer-based models have become the dominant end-to-end architecture for speech processing tasks. With the objective of enhancing the conformer architecture for efficient training and inference, we carefully redesigned Conformer with a novel downsampling schema. The proposed model, named Fast Conformer(FC), is 2.8x faster than the original Conformer, supports scaling to Billion parameters witho… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.05084v6-abstract-full').style.display = 'inline'; document.getElementById('2305.05084v6-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2305.05084v6-abstract-full" style="display: none;"> Conformer-based models have become the dominant end-to-end architecture for speech processing tasks. With the objective of enhancing the conformer architecture for efficient training and inference, we carefully redesigned Conformer with a novel downsampling schema. The proposed model, named Fast Conformer(FC), is 2.8x faster than the original Conformer, supports scaling to Billion parameters without any changes to the core architecture and also achieves state-of-the-art accuracy on Automatic Speech Recognition benchmarks. To enable transcription of long-form speech up to 11 hours, we replaced global attention with limited context attention post-training, while also improving accuracy through fine-tuning with the addition of a global token. Fast Conformer, when combined with a Transformer decoder also outperforms the original Conformer in accuracy and in speed for Speech Translation and Spoken Language Understanding. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.05084v6-abstract-full').style.display = 'none'; document.getElementById('2305.05084v6-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 September, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 8 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at ASRU 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2304.06795">arXiv:2304.06795</a> <span> [<a href="https://arxiv.org/pdf/2304.06795">pdf</a>, <a href="https://arxiv.org/format/2304.06795">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Efficient Sequence Transduction by Jointly Predicting Tokens and Durations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Xu%2C+H">Hainan Xu</a>, <a href="/search/eess?searchtype=author&query=Jia%2C+F">Fei Jia</a>, <a href="/search/eess?searchtype=author&query=Majumdar%2C+S">Somshubra Majumdar</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+H">He Huang</a>, <a href="/search/eess?searchtype=author&query=Watanabe%2C+S">Shinji Watanabe</a>, <a href="/search/eess?searchtype=author&query=Ginsburg%2C+B">Boris Ginsburg</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2304.06795v2-abstract-short" style="display: inline;"> This paper introduces a novel Token-and-Duration Transducer (TDT) architecture for sequence-to-sequence tasks. TDT extends conventional RNN-Transducer architectures by jointly predicting both a token and its duration, i.e. the number of input frames covered by the emitted token. This is achieved by using a joint network with two outputs which are independently normalized to generate distributions… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2304.06795v2-abstract-full').style.display = 'inline'; document.getElementById('2304.06795v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2304.06795v2-abstract-full" style="display: none;"> This paper introduces a novel Token-and-Duration Transducer (TDT) architecture for sequence-to-sequence tasks. TDT extends conventional RNN-Transducer architectures by jointly predicting both a token and its duration, i.e. the number of input frames covered by the emitted token. This is achieved by using a joint network with two outputs which are independently normalized to generate distributions over tokens and durations. During inference, TDT models can skip input frames guided by the predicted duration output, which makes them significantly faster than conventional Transducers which process the encoder output frame by frame. TDT models achieve both better accuracy and significantly faster inference than conventional Transducers on different sequence transduction tasks. TDT models for Speech Recognition achieve better accuracy and up to 2.82X faster inference than conventional Transducers. TDT models for Speech Translation achieve an absolute gain of over 1 BLEU on the MUST-C test compared with conventional Transducers, and its inference is 2.27X faster. In Speech Intent Classification and Slot Filling tasks, TDT models improve the intent accuracy by up to over 1% (absolute) over conventional Transducers, while running up to 1.28X faster. Our implementation of the TDT model will be open-sourced with the NeMo (https://github.com/NVIDIA/NeMo) toolkit. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2304.06795v2-abstract-full').style.display = 'none'; document.getElementById('2304.06795v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 13 April, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2303.10384">arXiv:2303.10384</a> <span> [<a href="https://arxiv.org/pdf/2303.10384">pdf</a>, <a href="https://arxiv.org/format/2303.10384">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/ICASSP49357.2023.10096679">10.1109/ICASSP49357.2023.10096679 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Powerful and Extensible WFST Framework for RNN-Transducer Losses </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Laptev%2C+A">Aleksandr Laptev</a>, <a href="/search/eess?searchtype=author&query=Bataev%2C+V">Vladimir Bataev</a>, <a href="/search/eess?searchtype=author&query=Gitman%2C+I">Igor Gitman</a>, <a href="/search/eess?searchtype=author&query=Ginsburg%2C+B">Boris Ginsburg</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2303.10384v1-abstract-short" style="display: inline;"> This paper presents a framework based on Weighted Finite-State Transducers (WFST) to simplify the development of modifications for RNN-Transducer (RNN-T) loss. Existing implementations of RNN-T use CUDA-related code, which is hard to extend and debug. WFSTs are easy to construct and extend, and allow debugging through visualization. We introduce two WFST-powered RNN-T implementations: (1) "Compose… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.10384v1-abstract-full').style.display = 'inline'; document.getElementById('2303.10384v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2303.10384v1-abstract-full" style="display: none;"> This paper presents a framework based on Weighted Finite-State Transducers (WFST) to simplify the development of modifications for RNN-Transducer (RNN-T) loss. Existing implementations of RNN-T use CUDA-related code, which is hard to extend and debug. WFSTs are easy to construct and extend, and allow debugging through visualization. We introduce two WFST-powered RNN-T implementations: (1) "Compose-Transducer", based on a composition of the WFST graphs from acoustic and textual schema -- computationally competitive and easy to modify; (2) "Grid-Transducer", which constructs the lattice directly for further computations -- most compact, and computationally efficient. We illustrate the ease of extensibility through introduction of a new W-Transducer loss -- the adaptation of the Connectionist Temporal Classification with Wild Cards. W-Transducer (W-RNNT) consistently outperforms the standard RNN-T in a weakly-supervised data setup with missing parts of transcriptions at the beginning and end of utterances. All RNN-T losses are implemented with the k2 framework and are available in the NeMo toolkit. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.10384v1-abstract-full').style.display = 'none'; document.getElementById('2303.10384v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 March, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">To appear in Proc. ICASSP 2023, June 04-10, 2023, Rhodes island, Greece. 5 pages, 5 figures, 3 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2303.07578">arXiv:2303.07578</a> <span> [<a href="https://arxiv.org/pdf/2303.07578">pdf</a>, <a href="https://arxiv.org/ps/2303.07578">ps</a>, <a href="https://arxiv.org/format/2303.07578">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> VANI: Very-lightweight Accent-controllable TTS for Native and Non-native speakers with Identity Preservation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Badlani%2C+R">Rohan Badlani</a>, <a href="/search/eess?searchtype=author&query=Arora%2C+A">Akshit Arora</a>, <a href="/search/eess?searchtype=author&query=Ghosh%2C+S">Subhankar Ghosh</a>, <a href="/search/eess?searchtype=author&query=Valle%2C+R">Rafael Valle</a>, <a href="/search/eess?searchtype=author&query=Shih%2C+K+J">Kevin J. Shih</a>, <a href="/search/eess?searchtype=author&query=Santos%2C+J+F">Jo茫o Felipe Santos</a>, <a href="/search/eess?searchtype=author&query=Ginsburg%2C+B">Boris Ginsburg</a>, <a href="/search/eess?searchtype=author&query=Catanzaro%2C+B">Bryan Catanzaro</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2303.07578v1-abstract-short" style="display: inline;"> We introduce VANI, a very lightweight multi-lingual accent controllable speech synthesis system. Our model builds upon disentanglement strategies proposed in RADMMM and supports explicit control of accent, language, speaker and fine-grained $F_0$ and energy features for speech synthesis. We utilize the Indic languages dataset, released for LIMMITS 2023 as part of ICASSP Signal Processing Grand Cha… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.07578v1-abstract-full').style.display = 'inline'; document.getElementById('2303.07578v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2303.07578v1-abstract-full" style="display: none;"> We introduce VANI, a very lightweight multi-lingual accent controllable speech synthesis system. Our model builds upon disentanglement strategies proposed in RADMMM and supports explicit control of accent, language, speaker and fine-grained $F_0$ and energy features for speech synthesis. We utilize the Indic languages dataset, released for LIMMITS 2023 as part of ICASSP Signal Processing Grand Challenge, to synthesize speech in 3 different languages. Our model supports transferring the language of a speaker while retaining their voice and the native accent of the target language. We utilize the large-parameter RADMMM model for Track $1$ and lightweight VANI model for Track $2$ and $3$ of the competition. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.07578v1-abstract-full').style.display = 'none'; document.getElementById('2303.07578v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 March, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Presentation accepted at ICASSP 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2302.14036">arXiv:2302.14036</a> <span> [<a href="https://arxiv.org/pdf/2302.14036">pdf</a>, <a href="https://arxiv.org/format/2302.14036">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.21437/interspeech.2023-906">10.21437/interspeech.2023-906 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Text-only domain adaptation for end-to-end ASR using integrated text-to-mel-spectrogram generator </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Bataev%2C+V">Vladimir Bataev</a>, <a href="/search/eess?searchtype=author&query=Korostik%2C+R">Roman Korostik</a>, <a href="/search/eess?searchtype=author&query=Shabalin%2C+E">Evgeny Shabalin</a>, <a href="/search/eess?searchtype=author&query=Lavrukhin%2C+V">Vitaly Lavrukhin</a>, <a href="/search/eess?searchtype=author&query=Ginsburg%2C+B">Boris Ginsburg</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2302.14036v2-abstract-short" style="display: inline;"> We propose an end-to-end Automatic Speech Recognition (ASR) system that can be trained on transcribed speech data, text-only data, or a mixture of both. The proposed model uses an integrated auxiliary block for text-based training. This block combines a non-autoregressive multi-speaker text-to-mel-spectrogram generator with a GAN-based enhancer to improve the spectrogram quality. The proposed syst… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2302.14036v2-abstract-full').style.display = 'inline'; document.getElementById('2302.14036v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2302.14036v2-abstract-full" style="display: none;"> We propose an end-to-end Automatic Speech Recognition (ASR) system that can be trained on transcribed speech data, text-only data, or a mixture of both. The proposed model uses an integrated auxiliary block for text-based training. This block combines a non-autoregressive multi-speaker text-to-mel-spectrogram generator with a GAN-based enhancer to improve the spectrogram quality. The proposed system can generate a mel-spectrogram dynamically during training. It can be used to adapt the ASR model to a new domain by using text-only data from this domain. We demonstrate that the proposed training method significantly improves ASR accuracy compared to the system trained on transcribed speech only. It also surpasses cascade TTS systems with the vocoder in the adaptation quality and training speed. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2302.14036v2-abstract-full').style.display = 'none'; document.getElementById('2302.14036v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 August, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 27 February, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to INTERSPEECH 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2302.08137">arXiv:2302.08137</a> <span> [<a href="https://arxiv.org/pdf/2302.08137">pdf</a>, <a href="https://arxiv.org/format/2302.08137">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> ACE-VC: Adaptive and Controllable Voice Conversion using Explicitly Disentangled Self-supervised Speech Representations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Hussain%2C+S">Shehzeen Hussain</a>, <a href="/search/eess?searchtype=author&query=Neekhara%2C+P">Paarth Neekhara</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+J">Jocelyn Huang</a>, <a href="/search/eess?searchtype=author&query=Li%2C+J">Jason Li</a>, <a href="/search/eess?searchtype=author&query=Ginsburg%2C+B">Boris Ginsburg</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2302.08137v1-abstract-short" style="display: inline;"> In this work, we propose a zero-shot voice conversion method using speech representations trained with self-supervised learning. First, we develop a multi-task model to decompose a speech utterance into features such as linguistic content, speaker characteristics, and speaking style. To disentangle content and speaker representations, we propose a training strategy based on Siamese networks that e… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2302.08137v1-abstract-full').style.display = 'inline'; document.getElementById('2302.08137v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2302.08137v1-abstract-full" style="display: none;"> In this work, we propose a zero-shot voice conversion method using speech representations trained with self-supervised learning. First, we develop a multi-task model to decompose a speech utterance into features such as linguistic content, speaker characteristics, and speaking style. To disentangle content and speaker representations, we propose a training strategy based on Siamese networks that encourages similarity between the content representations of the original and pitch-shifted audio. Next, we develop a synthesis model with pitch and duration predictors that can effectively reconstruct the speech signal from its decomposed representation. Our framework allows controllable and speaker-adaptive synthesis to perform zero-shot any-to-any voice conversion achieving state-of-the-art results on metrics evaluating speaker similarity, intelligibility, and naturalness. Using just 10 seconds of data for a target speaker, our framework can perform voice swapping and achieves a speaker verification EER of 5.5% for seen speakers and 8.4% for unseen speakers. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2302.08137v1-abstract-full').style.display = 'none'; document.getElementById('2302.08137v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 February, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Published as a conference paper at ICASSP 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2212.08703">arXiv:2212.08703</a> <span> [<a href="https://arxiv.org/pdf/2212.08703">pdf</a>, <a href="https://arxiv.org/format/2212.08703">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/SLT54892.2023.10022960">10.1109/SLT54892.2023.10022960 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Fast Entropy-Based Methods of Word-Level Confidence Estimation for End-To-End Automatic Speech Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Laptev%2C+A">Aleksandr Laptev</a>, <a href="/search/eess?searchtype=author&query=Ginsburg%2C+B">Boris Ginsburg</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2212.08703v1-abstract-short" style="display: inline;"> This paper presents a class of new fast non-trainable entropy-based confidence estimation methods for automatic speech recognition. We show how per-frame entropy values can be normalized and aggregated to obtain a confidence measure per unit and per word for Connectionist Temporal Classification (CTC) and Recurrent Neural Network Transducer (RNN-T) models. Proposed methods have similar computation… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2212.08703v1-abstract-full').style.display = 'inline'; document.getElementById('2212.08703v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2212.08703v1-abstract-full" style="display: none;"> This paper presents a class of new fast non-trainable entropy-based confidence estimation methods for automatic speech recognition. We show how per-frame entropy values can be normalized and aggregated to obtain a confidence measure per unit and per word for Connectionist Temporal Classification (CTC) and Recurrent Neural Network Transducer (RNN-T) models. Proposed methods have similar computational complexity to the traditional method based on the maximum per-frame probability, but they are more adjustable, have a wider effective threshold range, and better push apart the confidence distributions of correct and incorrect words. We evaluate the proposed confidence measures on LibriSpeech test sets, and show that they are up to 2 and 4 times better than confidence estimation based on the maximum per-frame probability at detecting incorrect words for Conformer-CTC and Conformer-RNN-T models, respectively. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2212.08703v1-abstract-full').style.display = 'none'; document.getElementById('2212.08703v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 December, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">To appear in Proc. SLT 2022, Jan 09-12, 2023, Doha, Qatar. 8 pages, 4 figures, 4 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2211.05103">arXiv:2211.05103</a> <span> [<a href="https://arxiv.org/pdf/2211.05103">pdf</a>, <a href="https://arxiv.org/ps/2211.05103">ps</a>, <a href="https://arxiv.org/format/2211.05103">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Accidental Learners: Spoken Language Identification in Multilingual Self-Supervised Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Bartley%2C+T+M">Travis M. Bartley</a>, <a href="/search/eess?searchtype=author&query=Jia%2C+F">Fei Jia</a>, <a href="/search/eess?searchtype=author&query=Puvvada%2C+K+C">Krishna C. Puvvada</a>, <a href="/search/eess?searchtype=author&query=Kriman%2C+S">Samuel Kriman</a>, <a href="/search/eess?searchtype=author&query=Ginsburg%2C+B">Boris Ginsburg</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2211.05103v2-abstract-short" style="display: inline;"> In this paper, we extend previous self-supervised approaches for language identification by experimenting with Conformer based architecture in a multilingual pre-training paradigm. We find that pre-trained speech models optimally encode language discriminatory information in lower layers. Further, we demonstrate that the embeddings obtained from these layers are significantly robust to classify un… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.05103v2-abstract-full').style.display = 'inline'; document.getElementById('2211.05103v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2211.05103v2-abstract-full" style="display: none;"> In this paper, we extend previous self-supervised approaches for language identification by experimenting with Conformer based architecture in a multilingual pre-training paradigm. We find that pre-trained speech models optimally encode language discriminatory information in lower layers. Further, we demonstrate that the embeddings obtained from these layers are significantly robust to classify unseen languages and different acoustic environments without additional training. After fine-tuning a pre-trained Conformer model on the VoxLingua107 dataset, we achieve results similar to current state-of-the-art systems for language identification. More, our model accomplishes this with 5x less parameters. We open-source the model through the NVIDIA NeMo toolkit. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.05103v2-abstract-full').style.display = 'none'; document.getElementById('2211.05103v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 March, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 9 November, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Submitted to ICASSP 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2211.03541">arXiv:2211.03541</a> <span> [<a href="https://arxiv.org/pdf/2211.03541">pdf</a>, <a href="https://arxiv.org/format/2211.03541">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Multi-blank Transducers for Speech Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Xu%2C+H">Hainan Xu</a>, <a href="/search/eess?searchtype=author&query=Jia%2C+F">Fei Jia</a>, <a href="/search/eess?searchtype=author&query=Majumdar%2C+S">Somshubra Majumdar</a>, <a href="/search/eess?searchtype=author&query=Watanabe%2C+S">Shinji Watanabe</a>, <a href="/search/eess?searchtype=author&query=Ginsburg%2C+B">Boris Ginsburg</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2211.03541v2-abstract-short" style="display: inline;"> This paper proposes a modification to RNN-Transducer (RNN-T) models for automatic speech recognition (ASR). In standard RNN-T, the emission of a blank symbol consumes exactly one input frame; in our proposed method, we introduce additional blank symbols, which consume two or more input frames when emitted. We refer to the added symbols as big blanks, and the method multi-blank RNN-T. For training… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.03541v2-abstract-full').style.display = 'inline'; document.getElementById('2211.03541v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2211.03541v2-abstract-full" style="display: none;"> This paper proposes a modification to RNN-Transducer (RNN-T) models for automatic speech recognition (ASR). In standard RNN-T, the emission of a blank symbol consumes exactly one input frame; in our proposed method, we introduce additional blank symbols, which consume two or more input frames when emitted. We refer to the added symbols as big blanks, and the method multi-blank RNN-T. For training multi-blank RNN-Ts, we propose a novel logit under-normalization method in order to prioritize emissions of big blanks. With experiments on multiple languages and datasets, we show that multi-blank RNN-T methods could bring relative speedups of over +90%/+139% to model inference for English Librispeech and German Multilingual Librispeech datasets, respectively. The multi-blank RNN-T method also improves ASR accuracy consistently. We will release our implementation of the method in the NeMo (https://github.com/NVIDIA/NeMo) toolkit. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.03541v2-abstract-full').style.display = 'none'; document.getElementById('2211.03541v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 November, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> ICASSP 2023 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2211.00585">arXiv:2211.00585</a> <span> [<a href="https://arxiv.org/pdf/2211.00585">pdf</a>, <a href="https://arxiv.org/format/2211.00585">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Adapter-Based Extension of Multi-Speaker Text-to-Speech Model for New Speakers </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Hsieh%2C+C">Cheng-Ping Hsieh</a>, <a href="/search/eess?searchtype=author&query=Ghosh%2C+S">Subhankar Ghosh</a>, <a href="/search/eess?searchtype=author&query=Ginsburg%2C+B">Boris Ginsburg</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2211.00585v1-abstract-short" style="display: inline;"> Fine-tuning is a popular method for adapting text-to-speech (TTS) models to new speakers. However this approach has some challenges. Usually fine-tuning requires several hours of high quality speech per speaker. There is also that fine-tuning will negatively affect the quality of speech synthesis for previously learnt speakers. In this paper we propose an alternative approach for TTS adaptation ba… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.00585v1-abstract-full').style.display = 'inline'; document.getElementById('2211.00585v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2211.00585v1-abstract-full" style="display: none;"> Fine-tuning is a popular method for adapting text-to-speech (TTS) models to new speakers. However this approach has some challenges. Usually fine-tuning requires several hours of high quality speech per speaker. There is also that fine-tuning will negatively affect the quality of speech synthesis for previously learnt speakers. In this paper we propose an alternative approach for TTS adaptation based on using parameter-efficient adapter modules. In the proposed approach, a few small adapter modules are added to the original network. The original weights are frozen, and only the adapters are fine-tuned on speech for new speaker. The parameter-efficient fine-tuning approach will produce a new model with high level of parameter sharing with original model. Our experiments on LibriTTS, HiFi-TTS and VCTK datasets validate the effectiveness of adapter-based method through objective and subjective metrics. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.00585v1-abstract-full').style.display = 'none'; document.getElementById('2211.00585v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 November, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Submitted to ICASSP 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2210.15781">arXiv:2210.15781</a> <span> [<a href="https://arxiv.org/pdf/2210.15781">pdf</a>, <a href="https://arxiv.org/format/2210.15781">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> A Compact End-to-End Model with Local and Global Context for Spoken Language Identification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Jia%2C+F">Fei Jia</a>, <a href="/search/eess?searchtype=author&query=Koluguri%2C+N+R">Nithin Rao Koluguri</a>, <a href="/search/eess?searchtype=author&query=Balam%2C+J">Jagadeesh Balam</a>, <a href="/search/eess?searchtype=author&query=Ginsburg%2C+B">Boris Ginsburg</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2210.15781v2-abstract-short" style="display: inline;"> We introduce TitaNet-LID, a compact end-to-end neural network for Spoken Language Identification (LID) that is based on the ContextNet architecture. TitaNet-LID employs 1D depth-wise separable convolutions and Squeeze-and-Excitation layers to effectively capture local and global context within an utterance. Despite its small size, TitaNet-LID achieves performance similar to state-of-the-art models… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2210.15781v2-abstract-full').style.display = 'inline'; document.getElementById('2210.15781v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2210.15781v2-abstract-full" style="display: none;"> We introduce TitaNet-LID, a compact end-to-end neural network for Spoken Language Identification (LID) that is based on the ContextNet architecture. TitaNet-LID employs 1D depth-wise separable convolutions and Squeeze-and-Excitation layers to effectively capture local and global context within an utterance. Despite its small size, TitaNet-LID achieves performance similar to state-of-the-art models on the VoxLingua107 dataset while being 10 times smaller. Furthermore, it can be easily adapted to new acoustic conditions and unseen languages through simple fine-tuning, achieving a state-of-the-art accuracy of 88.2% on the FLEURS benchmark. Our model is scalable and can achieve a better trade-off between accuracy and speed. TitaNet-LID performs well even on short utterances less than 5s in length, indicating its robustness to input length. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2210.15781v2-abstract-full').style.display = 'none'; document.getElementById('2210.15781v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 August, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 27 October, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to INTERSPEECH 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2210.03255">arXiv:2210.03255</a> <span> [<a href="https://arxiv.org/pdf/2210.03255">pdf</a>, <a href="https://arxiv.org/format/2210.03255">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Damage Control During Domain Adaptation for Transducer Based Automatic Speech Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Majumdar%2C+S">Somshubra Majumdar</a>, <a href="/search/eess?searchtype=author&query=Acharya%2C+S">Shantanu Acharya</a>, <a href="/search/eess?searchtype=author&query=Lavrukhin%2C+V">Vitaly Lavrukhin</a>, <a href="/search/eess?searchtype=author&query=Ginsburg%2C+B">Boris Ginsburg</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2210.03255v1-abstract-short" style="display: inline;"> Automatic speech recognition models are often adapted to improve their accuracy in a new domain. A potential drawback of model adaptation to new domains is catastrophic forgetting, where the Word Error Rate on the original domain is significantly degraded. This paper addresses the situation when we want to simultaneously adapt automatic speech recognition models to a new domain and limit the degra… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2210.03255v1-abstract-full').style.display = 'inline'; document.getElementById('2210.03255v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2210.03255v1-abstract-full" style="display: none;"> Automatic speech recognition models are often adapted to improve their accuracy in a new domain. A potential drawback of model adaptation to new domains is catastrophic forgetting, where the Word Error Rate on the original domain is significantly degraded. This paper addresses the situation when we want to simultaneously adapt automatic speech recognition models to a new domain and limit the degradation of accuracy on the original domain without access to the original training dataset. We propose several techniques such as a limited training strategy and regularized adapter modules for the Transducer encoder, prediction, and joiner network. We apply these methods to the Google Speech Commands and to the UK and Ireland English Dialect speech data set and obtain strong results on the new target domain while limiting the degradation on the original domain. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2210.03255v1-abstract-full').style.display = 'none'; document.getElementById('2210.03255v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 October, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">To appear in Proc. SLT 2022, Jan 09-12, 2023, Doha, Qatar</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2206.04658">arXiv:2206.04658</a> <span> [<a href="https://arxiv.org/pdf/2206.04658">pdf</a>, <a href="https://arxiv.org/format/2206.04658">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> BigVGAN: A Universal Neural Vocoder with Large-Scale Training </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Lee%2C+S">Sang-gil Lee</a>, <a href="/search/eess?searchtype=author&query=Ping%2C+W">Wei Ping</a>, <a href="/search/eess?searchtype=author&query=Ginsburg%2C+B">Boris Ginsburg</a>, <a href="/search/eess?searchtype=author&query=Catanzaro%2C+B">Bryan Catanzaro</a>, <a href="/search/eess?searchtype=author&query=Yoon%2C+S">Sungroh Yoon</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2206.04658v2-abstract-short" style="display: inline;"> Despite recent progress in generative adversarial network (GAN)-based vocoders, where the model generates raw waveform conditioned on acoustic features, it is challenging to synthesize high-fidelity audio for numerous speakers across various recording environments. In this work, we present BigVGAN, a universal vocoder that generalizes well for various out-of-distribution scenarios without fine-tun… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2206.04658v2-abstract-full').style.display = 'inline'; document.getElementById('2206.04658v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2206.04658v2-abstract-full" style="display: none;"> Despite recent progress in generative adversarial network (GAN)-based vocoders, where the model generates raw waveform conditioned on acoustic features, it is challenging to synthesize high-fidelity audio for numerous speakers across various recording environments. In this work, we present BigVGAN, a universal vocoder that generalizes well for various out-of-distribution scenarios without fine-tuning. We introduce periodic activation function and anti-aliased representation into the GAN generator, which brings the desired inductive bias for audio synthesis and significantly improves audio quality. In addition, we train our GAN vocoder at the largest scale up to 112M parameters, which is unprecedented in the literature. We identify and address the failure modes in large-scale GAN training for audio, while maintaining high-fidelity output without over-regularization. Our BigVGAN, trained only on clean speech (LibriTTS), achieves the state-of-the-art performance for various zero-shot (out-of-distribution) conditions, including unseen speakers, languages, recording environments, singing voices, music, and instrumental audio. We release our code and model at: https://github.com/NVIDIA/BigVGAN <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2206.04658v2-abstract-full').style.display = 'none'; document.getElementById('2206.04658v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 February, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 9 June, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">To appear at ICLR 2023. Listen to audio samples from BigVGAN at: https://bigvgan-demo.github.io/</span> </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Ginsburg%2C+B&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Ginsburg%2C+B&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Ginsburg%2C+B&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> </ul> </nav> <div class="is-hidden-tablet">  <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>  </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary">  <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div>   <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div>  </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>

CINXE.COM

Search | arXiv e-print repository