Search | arXiv e-print repository

<!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"/> <meta name="viewport" content="width=device-width, initial-scale=1"/>  <link rel="apple-touch-icon" sizes="180x180" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-16x16.png"> <link rel="manifest" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/site.webmanifest"> <link rel="mask-icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/safari-pinned-tab.svg" color="#b31b1b"> <link rel="shortcut icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon.ico"> <meta name="msapplication-TileColor" content="#b31b1b"> <meta name="msapplication-config" content="images/icons/browserconfig.xml"> <meta name="theme-color" content="#b31b1b">  <title>Search | arXiv e-print repository</title> <script defer src="https://static.arxiv.org/static/base/1.0.0a5/fontawesome-free-5.11.2-web/js/all.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/base/1.0.0a5/css/arxivstyle.css" /> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ messageStyle: "none", extensions: ["tex2jax.js"], jax: ["input/TeX", "output/HTML-CSS"], tex2jax: { inlineMath: [ ['$','$'], ["\$","\$"] ], displayMath: [ ['$$','$$'], ["\\[","\\]"] ], processEscapes: true, ignoreClass: '.*', processClass: 'mathjax.*' }, TeX: { extensions: ["AMSmath.js", "AMSsymbols.js", "noErrors.js"], noErrors: { inlineDelimiters: ["$","$"], multiLine: false, style: { "font-size": "normal", "border": "" } } }, "HTML-CSS": { availableFonts: ["TeX"] } }); </script> <script src='//static.arxiv.org/MathJax-2.7.3/MathJax.js'></script> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/notification.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/bulma-tooltip.min.css" /> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/search.css" /> <script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g=" crossorigin="anonymous"></script> <script src="https://static.arxiv.org/static/search/0.5.6/js/fieldset.js"></script> <style> radio#cf-customfield_11400 { display: none; } </style> </head> <body> <header><a href="#main-container" class="is-sr-only">Skip to main content</a>  <div class="attribution level is-marginless" role="banner"> <div class="level-left"> <a class="level-item" href="https://cornell.edu/"><img src="https://static.arxiv.org/static/base/1.0.0a5/images/cornell-reduced-white-SMALL.svg" alt="Cornell University" width="200" aria-label="logo" /></a> </div> <div class="level-right is-marginless"><p class="sponsors level-item is-marginless"><span id="support-ack-url">We gratefully acknowledge support from<br /> the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors. <a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div>  <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div>  <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–34 of 34 results for author: <span class="mathjax">Zelasko, P</span> </h1> </div> <div class="level-right is-hidden-mobile">  <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>  </span> </div> </div> <div class="content"> <form method="GET" action="/search/eess" aria-role="search"> Searching in archive <strong>eess</strong>. <a href="/search/?searchtype=author&query=Zelasko%2C+P">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." type="text" value="Zelasko, P"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Zelasko%2C+P&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Zelasko, P"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. </div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.05945">arXiv:2411.05945</a> <span> [<a href="https://arxiv.org/pdf/2411.05945">pdf</a>, <a href="https://arxiv.org/format/2411.05945">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multiagent Systems">cs.MA</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> NeKo: Toward Post Recognition Generative Correction Large Language Models with Task-Oriented Experts </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Lin%2C+Y">Yen-Ting Lin</a>, <a href="/search/eess?searchtype=author&query=Yang%2C+C+H">Chao-Han Huck Yang</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+Z">Zhehuai Chen</a>, <a href="/search/eess?searchtype=author&query=Zelasko%2C+P">Piotr Zelasko</a>, <a href="/search/eess?searchtype=author&query=Yang%2C+X">Xuesong Yang</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+Z">Zih-Ching Chen</a>, <a href="/search/eess?searchtype=author&query=Puvvada%2C+K+C">Krishna C Puvvada</a>, <a href="/search/eess?searchtype=author&query=Fu%2C+S">Szu-Wei Fu</a>, <a href="/search/eess?searchtype=author&query=Hu%2C+K">Ke Hu</a>, <a href="/search/eess?searchtype=author&query=Chiu%2C+J+W">Jun Wei Chiu</a>, <a href="/search/eess?searchtype=author&query=Balam%2C+J">Jagadeesh Balam</a>, <a href="/search/eess?searchtype=author&query=Ginsburg%2C+B">Boris Ginsburg</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+Y+F">Yu-Chiang Frank Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.05945v1-abstract-short" style="display: inline;"> Construction of a general-purpose post-recognition error corrector poses a crucial question: how can we most effectively train a model on a large mixture of domain datasets? The answer would lie in learning dataset-specific features and digesting their knowledge in a single model. Previous methods achieve this by having separate correction language models, resulting in a significant increase in pa… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05945v1-abstract-full').style.display = 'inline'; document.getElementById('2411.05945v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.05945v1-abstract-full" style="display: none;"> Construction of a general-purpose post-recognition error corrector poses a crucial question: how can we most effectively train a model on a large mixture of domain datasets? The answer would lie in learning dataset-specific features and digesting their knowledge in a single model. Previous methods achieve this by having separate correction language models, resulting in a significant increase in parameters. In this work, we present Mixture-of-Experts as a solution, highlighting that MoEs are much more than a scalability tool. We propose a Multi-Task Correction MoE, where we train the experts to become an ``expert'' of speech-to-text, language-to-text and vision-to-text datasets by learning to route each dataset's tokens to its mapped expert. Experiments on the Open ASR Leaderboard show that we explore a new state-of-the-art performance by achieving an average relative $5.0$% WER reduction and substantial improvements in BLEU scores for speech and translation tasks. On zero-shot evaluation, NeKo outperforms GPT-3.5 and Claude-Opus with $15.5$% to $27.6$% relative WER reduction in the Hyporadise benchmark. NeKo performs competitively on grammar and post-OCR correction as a multi-task model. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05945v1-abstract-full').style.display = 'none'; document.getElementById('2411.05945v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">NeKo work has been done in June 2024. NeKo LMs will be open source on https://huggingface.co/nvidia under the MIT license</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.17485">arXiv:2410.17485</a> <span> [<a href="https://arxiv.org/pdf/2410.17485">pdf</a>, <a href="https://arxiv.org/format/2410.17485">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> VoiceTextBlender: Augmenting Large Language Models with Speech Capabilities via Single-Stage Joint Speech-Text Supervised Fine-Tuning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Peng%2C+Y">Yifan Peng</a>, <a href="/search/eess?searchtype=author&query=Puvvada%2C+K+C">Krishna C. Puvvada</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+Z">Zhehuai Chen</a>, <a href="/search/eess?searchtype=author&query=Zelasko%2C+P">Piotr Zelasko</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+H">He Huang</a>, <a href="/search/eess?searchtype=author&query=Dhawan%2C+K">Kunal Dhawan</a>, <a href="/search/eess?searchtype=author&query=Hu%2C+K">Ke Hu</a>, <a href="/search/eess?searchtype=author&query=Watanabe%2C+S">Shinji Watanabe</a>, <a href="/search/eess?searchtype=author&query=Balam%2C+J">Jagadeesh Balam</a>, <a href="/search/eess?searchtype=author&query=Ginsburg%2C+B">Boris Ginsburg</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.17485v2-abstract-short" style="display: inline;"> Recent studies have augmented large language models (LLMs) with speech capabilities, leading to the development of speech language models (SpeechLMs). Earlier SpeechLMs focused on single-turn speech-based question answering (QA), where user input comprised a speech context and a text question. More recent studies have extended this to multi-turn conversations, though they often require complex, mu… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.17485v2-abstract-full').style.display = 'inline'; document.getElementById('2410.17485v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.17485v2-abstract-full" style="display: none;"> Recent studies have augmented large language models (LLMs) with speech capabilities, leading to the development of speech language models (SpeechLMs). Earlier SpeechLMs focused on single-turn speech-based question answering (QA), where user input comprised a speech context and a text question. More recent studies have extended this to multi-turn conversations, though they often require complex, multi-stage supervised fine-tuning (SFT) with diverse data. Another critical challenge with SpeechLMs is catastrophic forgetting, where models optimized for speech tasks suffer significant degradation in text-only performance. To mitigate these issues, we propose a novel single-stage joint speech-text SFT approach on the low-rank adaptation (LoRA) of the LLM backbone. Our joint SFT combines text-only SFT data with three types of speech-related data: speech recognition and translation, speech-based QA, and mixed-modal SFT. Compared to previous SpeechLMs with 7B or 13B parameters, our 3B model demonstrates superior performance across various speech benchmarks while preserving the original capabilities on text-only tasks. Furthermore, our model shows emergent abilities of effectively handling previously unseen prompts and tasks, including multi-turn, mixed-modal inputs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.17485v2-abstract-full').style.display = 'none'; document.getElementById('2410.17485v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 22 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at NAACL 2025 main conference</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.13523">arXiv:2409.13523</a> <span> [<a href="https://arxiv.org/pdf/2409.13523">pdf</a>, <a href="https://arxiv.org/format/2409.13523">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> EMMeTT: Efficient Multimodal Machine Translation Training </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=%C5%BBelasko%2C+P">Piotr 呕elasko</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+Z">Zhehuai Chen</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+M">Mengru Wang</a>, <a href="/search/eess?searchtype=author&query=Galvez%2C+D">Daniel Galvez</a>, <a href="/search/eess?searchtype=author&query=Hrinchuk%2C+O">Oleksii Hrinchuk</a>, <a href="/search/eess?searchtype=author&query=Ding%2C+S">Shuoyang Ding</a>, <a href="/search/eess?searchtype=author&query=Hu%2C+K">Ke Hu</a>, <a href="/search/eess?searchtype=author&query=Balam%2C+J">Jagadeesh Balam</a>, <a href="/search/eess?searchtype=author&query=Lavrukhin%2C+V">Vitaly Lavrukhin</a>, <a href="/search/eess?searchtype=author&query=Ginsburg%2C+B">Boris Ginsburg</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.13523v1-abstract-short" style="display: inline;"> A rising interest in the modality extension of foundation language models warrants discussion on the most effective, and efficient, multimodal training approach. This work focuses on neural machine translation (NMT) and proposes a joint multimodal training regime of Speech-LLM to include automatic speech translation (AST). We investigate two different foundation model architectures, decoder-only G… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.13523v1-abstract-full').style.display = 'inline'; document.getElementById('2409.13523v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.13523v1-abstract-full" style="display: none;"> A rising interest in the modality extension of foundation language models warrants discussion on the most effective, and efficient, multimodal training approach. This work focuses on neural machine translation (NMT) and proposes a joint multimodal training regime of Speech-LLM to include automatic speech translation (AST). We investigate two different foundation model architectures, decoder-only GPT and encoder-decoder T5, extended with Canary-1B's speech encoder. To handle joint multimodal training, we propose a novel training framework called EMMeTT. EMMeTT improves training efficiency with the following: balanced sampling across languages, datasets, and modalities; efficient sequential data iteration; and a novel 2D bucketing scheme for multimodal data, complemented by a batch size optimizer (OOMptimizer). We show that a multimodal training consistently helps with both architectures. Moreover, SALM-T5 trained with EMMeTT retains the original NMT capability while outperforming AST baselines on four-language subsets of FLORES and FLEURS. The resultant Multimodal Translation Model produces strong text and speech translation results at the same time. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.13523v1-abstract-full').style.display = 'none'; document.getElementById('2409.13523v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">4 pages, submitted to ICASSP 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.09785">arXiv:2409.09785</a> <span> [<a href="https://arxiv.org/pdf/2409.09785">pdf</a>, <a href="https://arxiv.org/format/2409.09785">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Large Language Model Based Generative Error Correction: A Challenge and Baselines for Speech Recognition, Speaker Tagging, and Emotion Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Yang%2C+C+H">Chao-Han Huck Yang</a>, <a href="/search/eess?searchtype=author&query=Park%2C+T">Taejin Park</a>, <a href="/search/eess?searchtype=author&query=Gong%2C+Y">Yuan Gong</a>, <a href="/search/eess?searchtype=author&query=Li%2C+Y">Yuanchao Li</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+Z">Zhehuai Chen</a>, <a href="/search/eess?searchtype=author&query=Lin%2C+Y">Yen-Ting Lin</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+C">Chen Chen</a>, <a href="/search/eess?searchtype=author&query=Hu%2C+Y">Yuchen Hu</a>, <a href="/search/eess?searchtype=author&query=Dhawan%2C+K">Kunal Dhawan</a>, <a href="/search/eess?searchtype=author&query=%C5%BBelasko%2C+P">Piotr 呕elasko</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+C">Chao Zhang</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+Y">Yun-Nung Chen</a>, <a href="/search/eess?searchtype=author&query=Tsao%2C+Y">Yu Tsao</a>, <a href="/search/eess?searchtype=author&query=Balam%2C+J">Jagadeesh Balam</a>, <a href="/search/eess?searchtype=author&query=Ginsburg%2C+B">Boris Ginsburg</a>, <a href="/search/eess?searchtype=author&query=Siniscalchi%2C+S+M">Sabato Marco Siniscalchi</a>, <a href="/search/eess?searchtype=author&query=Chng%2C+E+S">Eng Siong Chng</a>, <a href="/search/eess?searchtype=author&query=Bell%2C+P">Peter Bell</a>, <a href="/search/eess?searchtype=author&query=Lai%2C+C">Catherine Lai</a>, <a href="/search/eess?searchtype=author&query=Watanabe%2C+S">Shinji Watanabe</a>, <a href="/search/eess?searchtype=author&query=Stolcke%2C+A">Andreas Stolcke</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.09785v3-abstract-short" style="display: inline;"> Given recent advances in generative AI technology, a key question is how large language models (LLMs) can enhance acoustic modeling tasks using text decoding results from a frozen, pretrained automatic speech recognition (ASR) model. To explore new capabilities in language modeling for speech processing, we introduce the generative speech transcription error correction (GenSEC) challenge. This cha… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.09785v3-abstract-full').style.display = 'inline'; document.getElementById('2409.09785v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.09785v3-abstract-full" style="display: none;"> Given recent advances in generative AI technology, a key question is how large language models (LLMs) can enhance acoustic modeling tasks using text decoding results from a frozen, pretrained automatic speech recognition (ASR) model. To explore new capabilities in language modeling for speech processing, we introduce the generative speech transcription error correction (GenSEC) challenge. This challenge comprises three post-ASR language modeling tasks: (i) post-ASR transcription correction, (ii) speaker tagging, and (iii) emotion recognition. These tasks aim to emulate future LLM-based agents handling voice-based interfaces while remaining accessible to a broad audience by utilizing open pretrained language models or agent-based APIs. We also discuss insights from baseline evaluations, as well as lessons learned for designing future evaluations. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.09785v3-abstract-full').style.display = 'none'; document.getElementById('2409.09785v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 15 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">IEEE SLT 2024. The initial draft version has been done in December 2023. Post-ASR Text Processing and Understanding Community and LlaMA-7B pre-training correction model: https://huggingface.co/GenSEC-LLM/SLT-Task1-Llama2-7b-HyPo-baseline</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.19954">arXiv:2406.19954</a> <span> [<a href="https://arxiv.org/pdf/2406.19954">pdf</a>, <a href="https://arxiv.org/format/2406.19954">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> BESTOW: Efficient and Streamable Speech Language Model with the Best of Two Worlds in GPT and T5 </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Chen%2C+Z">Zhehuai Chen</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+H">He Huang</a>, <a href="/search/eess?searchtype=author&query=Hrinchuk%2C+O">Oleksii Hrinchuk</a>, <a href="/search/eess?searchtype=author&query=Puvvada%2C+K+C">Krishna C. Puvvada</a>, <a href="/search/eess?searchtype=author&query=Koluguri%2C+N+R">Nithin Rao Koluguri</a>, <a href="/search/eess?searchtype=author&query=%C5%BBelasko%2C+P">Piotr 呕elasko</a>, <a href="/search/eess?searchtype=author&query=Balam%2C+J">Jagadeesh Balam</a>, <a href="/search/eess?searchtype=author&query=Ginsburg%2C+B">Boris Ginsburg</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.19954v1-abstract-short" style="display: inline;"> Incorporating speech understanding capabilities into pretrained large-language models has become a vital research direction (SpeechLLM). The previous architectures can be categorized as: i) GPT-style, prepend speech prompts to the text prompts as a sequence of LLM inputs like a decoder-only model; ii) T5-style, introduce speech cross-attention to each layer of the pretrained LLMs. We propose BESTO… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.19954v1-abstract-full').style.display = 'inline'; document.getElementById('2406.19954v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.19954v1-abstract-full" style="display: none;"> Incorporating speech understanding capabilities into pretrained large-language models has become a vital research direction (SpeechLLM). The previous architectures can be categorized as: i) GPT-style, prepend speech prompts to the text prompts as a sequence of LLM inputs like a decoder-only model; ii) T5-style, introduce speech cross-attention to each layer of the pretrained LLMs. We propose BESTOW architecture to bring the BESt features from TwO Worlds into a single model that is highly efficient and has strong multitask capabilities. Moreover, there is no clear streaming solution for either style, especially considering the solution should generalize to speech multitask. We reformulate streamable SpeechLLM as a read-write policy problem and unifies the offline and streaming research with BESTOW architecture. Hence we demonstrate the first open-source SpeechLLM solution that enables Streaming and Multitask at scale (beyond ASR) at the same time. This streamable solution achieves very strong performance on a wide range of speech tasks (ASR, AST, SQA, unseen DynamicSuperb). It is end-to-end optimizable, with lower training/inference cost, and demonstrates LLM knowledge transferability to speech. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.19954v1-abstract-full').style.display = 'none'; document.getElementById('2406.19954v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 68T10 <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.7 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.19674">arXiv:2406.19674</a> <span> [<a href="https://arxiv.org/pdf/2406.19674">pdf</a>, <a href="https://arxiv.org/format/2406.19674">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Less is More: Accurate Speech Recognition & Translation without Web-Scale Data </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Puvvada%2C+K+C">Krishna C. Puvvada</a>, <a href="/search/eess?searchtype=author&query=%C5%BBelasko%2C+P">Piotr 呕elasko</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+H">He Huang</a>, <a href="/search/eess?searchtype=author&query=Hrinchuk%2C+O">Oleksii Hrinchuk</a>, <a href="/search/eess?searchtype=author&query=Koluguri%2C+N+R">Nithin Rao Koluguri</a>, <a href="/search/eess?searchtype=author&query=Dhawan%2C+K">Kunal Dhawan</a>, <a href="/search/eess?searchtype=author&query=Majumdar%2C+S">Somshubra Majumdar</a>, <a href="/search/eess?searchtype=author&query=Rastorgueva%2C+E">Elena Rastorgueva</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+Z">Zhehuai Chen</a>, <a href="/search/eess?searchtype=author&query=Lavrukhin%2C+V">Vitaly Lavrukhin</a>, <a href="/search/eess?searchtype=author&query=Balam%2C+J">Jagadeesh Balam</a>, <a href="/search/eess?searchtype=author&query=Ginsburg%2C+B">Boris Ginsburg</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.19674v1-abstract-short" style="display: inline;"> Recent advances in speech recognition and translation rely on hundreds of thousands of hours of Internet speech data. We argue that state-of-the art accuracy can be reached without relying on web-scale data. Canary - multilingual ASR and speech translation model, outperforms current state-of-the-art models - Whisper, OWSM, and Seamless-M4T on English, French, Spanish, and German languages, while b… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.19674v1-abstract-full').style.display = 'inline'; document.getElementById('2406.19674v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.19674v1-abstract-full" style="display: none;"> Recent advances in speech recognition and translation rely on hundreds of thousands of hours of Internet speech data. We argue that state-of-the art accuracy can be reached without relying on web-scale data. Canary - multilingual ASR and speech translation model, outperforms current state-of-the-art models - Whisper, OWSM, and Seamless-M4T on English, French, Spanish, and German languages, while being trained on an order of magnitude less data than these models. Three key factors enables such data-efficient model: (1) a FastConformer-based attention encoder-decoder architecture (2) training on synthetic data generated with machine translation and (3) advanced training techniques: data-balancing, dynamic data blending, dynamic bucketing and noise-robust fine-tuning. The model, weights, and training code will be open-sourced. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.19674v1-abstract-full').style.display = 'none'; document.getElementById('2406.19674v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at Interspeech-2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2304.05974">arXiv:2304.05974</a> <span> [<a href="https://arxiv.org/pdf/2304.05974">pdf</a>, <a href="https://arxiv.org/format/2304.05974">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Regularizing Contrastive Predictive Coding for Speech Applications </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Bhati%2C+S">Saurabhchand Bhati</a>, <a href="/search/eess?searchtype=author&query=Villalba%2C+J">Jes煤s Villalba</a>, <a href="/search/eess?searchtype=author&query=%C5%BBelasko%2C+P">Piotr 呕elasko</a>, <a href="/search/eess?searchtype=author&query=Moro-Velazquez%2C+L">Laureano Moro-Velazquez</a>, <a href="/search/eess?searchtype=author&query=Dehak%2C+N">Najim Dehak</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2304.05974v2-abstract-short" style="display: inline;"> Self-supervised methods such as Contrastive predictive Coding (CPC) have greatly improved the quality of the unsupervised representations. These representations significantly reduce the amount of labeled data needed for downstream task performance, such as automatic speech recognition. CPC learns representations by learning to predict future frames given current frames. Based on the observation th… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2304.05974v2-abstract-full').style.display = 'inline'; document.getElementById('2304.05974v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2304.05974v2-abstract-full" style="display: none;"> Self-supervised methods such as Contrastive predictive Coding (CPC) have greatly improved the quality of the unsupervised representations. These representations significantly reduce the amount of labeled data needed for downstream task performance, such as automatic speech recognition. CPC learns representations by learning to predict future frames given current frames. Based on the observation that the acoustic information, e.g., phones, changes slower than the feature extraction rate in CPC, we propose regularization techniques that impose slowness constraints on the features. Here we propose two regularization techniques: Self-expressing constraint and Left-or-Right regularization. We evaluate the proposed model on ABX and linear phone classification tasks, acoustic unit discovery, and automatic speech recognition. The regularized CPC trained on 100 hours of unlabeled data matches the performance of the baseline CPC trained on 360 hours of unlabeled data. We also show that our regularization techniques are complementary to data augmentation and can further boost the system's performance. In monolingual, cross-lingual, or multilingual settings, with/without data augmentation, regardless of the amount of data used for training, our regularized models outperformed the baseline CPC models on the ABX task. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2304.05974v2-abstract-full').style.display = 'none'; document.getElementById('2304.05974v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 April, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 12 April, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2211.00508">arXiv:2211.00508</a> <span> [<a href="https://arxiv.org/pdf/2211.00508">pdf</a>, <a href="https://arxiv.org/format/2211.00508">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Predicting Multi-Codebook Vector Quantization Indexes for Knowledge Distillation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Guo%2C+L">Liyong Guo</a>, <a href="/search/eess?searchtype=author&query=Yang%2C+X">Xiaoyu Yang</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+Q">Quandong Wang</a>, <a href="/search/eess?searchtype=author&query=Kong%2C+Y">Yuxiang Kong</a>, <a href="/search/eess?searchtype=author&query=Yao%2C+Z">Zengwei Yao</a>, <a href="/search/eess?searchtype=author&query=Cui%2C+F">Fan Cui</a>, <a href="/search/eess?searchtype=author&query=Kuang%2C+F">Fangjun Kuang</a>, <a href="/search/eess?searchtype=author&query=Kang%2C+W">Wei Kang</a>, <a href="/search/eess?searchtype=author&query=Lin%2C+L">Long Lin</a>, <a href="/search/eess?searchtype=author&query=Luo%2C+M">Mingshuang Luo</a>, <a href="/search/eess?searchtype=author&query=Zelasko%2C+P">Piotr Zelasko</a>, <a href="/search/eess?searchtype=author&query=Povey%2C+D">Daniel Povey</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2211.00508v1-abstract-short" style="display: inline;"> Knowledge distillation(KD) is a common approach to improve model performance in automatic speech recognition (ASR), where a student model is trained to imitate the output behaviour of a teacher model. However, traditional KD methods suffer from teacher label storage issue, especially when the training corpora are large. Although on-the-fly teacher label generation tackles this issue, the training… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.00508v1-abstract-full').style.display = 'inline'; document.getElementById('2211.00508v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2211.00508v1-abstract-full" style="display: none;"> Knowledge distillation(KD) is a common approach to improve model performance in automatic speech recognition (ASR), where a student model is trained to imitate the output behaviour of a teacher model. However, traditional KD methods suffer from teacher label storage issue, especially when the training corpora are large. Although on-the-fly teacher label generation tackles this issue, the training speed is significantly slower as the teacher model has to be evaluated every batch. In this paper, we reformulate the generation of teacher label as a codec problem. We propose a novel Multi-codebook Vector Quantization (MVQ) approach that compresses teacher embeddings to codebook indexes (CI). Based on this, a KD training framework (MVQ-KD) is proposed where a student model predicts the CI generated from the embeddings of a self-supervised pre-trained teacher model. Experiments on the LibriSpeech clean-100 hour show that MVQ-KD framework achieves comparable performance as traditional KD methods (l1, l2), while requiring 256 times less storage. When the full LibriSpeech dataset is used, MVQ-KD framework results in 13.8% and 8.2% relative word error rate reductions (WERRs) for non -streaming transducer on test-clean and test-other and 4.0% and 4.9% for streaming transducer. The implementation of this work is already released as a part of the open-source project icefall. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.00508v1-abstract-full').style.display = 'none'; document.getElementById('2211.00508v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 October, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Submitted to ICASSP 2022</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2211.00490">arXiv:2211.00490</a> <span> [<a href="https://arxiv.org/pdf/2211.00490">pdf</a>, <a href="https://arxiv.org/format/2211.00490">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Delay-penalized transducer for low-latency streaming ASR </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Kang%2C+W">Wei Kang</a>, <a href="/search/eess?searchtype=author&query=Yao%2C+Z">Zengwei Yao</a>, <a href="/search/eess?searchtype=author&query=Kuang%2C+F">Fangjun Kuang</a>, <a href="/search/eess?searchtype=author&query=Guo%2C+L">Liyong Guo</a>, <a href="/search/eess?searchtype=author&query=Yang%2C+X">Xiaoyu Yang</a>, <a href="/search/eess?searchtype=author&query=lin%2C+L">Long lin</a>, <a href="/search/eess?searchtype=author&query=%C5%BBelasko%2C+P">Piotr 呕elasko</a>, <a href="/search/eess?searchtype=author&query=Povey%2C+D">Daniel Povey</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2211.00490v1-abstract-short" style="display: inline;"> In streaming automatic speech recognition (ASR), it is desirable to reduce latency as much as possible while having minimum impact on recognition accuracy. Although a few existing methods are able to achieve this goal, they are difficult to implement due to their dependency on external alignments. In this paper, we propose a simple way to penalize symbol delay in transducer model, so that we can b… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.00490v1-abstract-full').style.display = 'inline'; document.getElementById('2211.00490v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2211.00490v1-abstract-full" style="display: none;"> In streaming automatic speech recognition (ASR), it is desirable to reduce latency as much as possible while having minimum impact on recognition accuracy. Although a few existing methods are able to achieve this goal, they are difficult to implement due to their dependency on external alignments. In this paper, we propose a simple way to penalize symbol delay in transducer model, so that we can balance the trade-off between symbol delay and accuracy for streaming models without external alignments. Specifically, our method adds a small constant times (T/2 - t), where T is the number of frames and t is the current frame, to all the non-blank log-probabilities (after normalization) that are fed into the two dimensional transducer recursion. For both streaming Conformer models and unidirectional long short-term memory (LSTM) models, experimental results show that it can significantly reduce the symbol delay with an acceptable performance degradation. Our method achieves similar delay-accuracy trade-off to the previously published FastEmit, but we believe our method is preferable because it has a better justification: it is equivalent to penalizing the average symbol delay. Our work is open-sourced and publicly available (https://github.com/k2-fsa/k2). <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.00490v1-abstract-full').style.display = 'none'; document.getElementById('2211.00490v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 October, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Submitted to 2023 IEEE International Conference on Acoustics, Speech and Signal Processing</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2211.00484">arXiv:2211.00484</a> <span> [<a href="https://arxiv.org/pdf/2211.00484">pdf</a>, <a href="https://arxiv.org/ps/2211.00484">ps</a>, <a href="https://arxiv.org/format/2211.00484">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Fast and parallel decoding for transducer </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Kang%2C+W">Wei Kang</a>, <a href="/search/eess?searchtype=author&query=Guo%2C+L">Liyong Guo</a>, <a href="/search/eess?searchtype=author&query=Kuang%2C+F">Fangjun Kuang</a>, <a href="/search/eess?searchtype=author&query=Lin%2C+L">Long Lin</a>, <a href="/search/eess?searchtype=author&query=Luo%2C+M">Mingshuang Luo</a>, <a href="/search/eess?searchtype=author&query=Yao%2C+Z">Zengwei Yao</a>, <a href="/search/eess?searchtype=author&query=Yang%2C+X">Xiaoyu Yang</a>, <a href="/search/eess?searchtype=author&query=%C5%BBelasko%2C+P">Piotr 呕elasko</a>, <a href="/search/eess?searchtype=author&query=Povey%2C+D">Daniel Povey</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2211.00484v1-abstract-short" style="display: inline;"> The transducer architecture is becoming increasingly popular in the field of speech recognition, because it is naturally streaming as well as high in accuracy. One of the drawbacks of transducer is that it is difficult to decode in a fast and parallel way due to an unconstrained number of symbols that can be emitted per time step. In this work, we introduce a constrained version of transducer loss… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.00484v1-abstract-full').style.display = 'inline'; document.getElementById('2211.00484v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2211.00484v1-abstract-full" style="display: none;"> The transducer architecture is becoming increasingly popular in the field of speech recognition, because it is naturally streaming as well as high in accuracy. One of the drawbacks of transducer is that it is difficult to decode in a fast and parallel way due to an unconstrained number of symbols that can be emitted per time step. In this work, we introduce a constrained version of transducer loss to learn strictly monotonic alignments between the sequences; we also improve the standard greedy search and beam search algorithms by limiting the number of symbols that can be emitted per time step in transducer decoding, making it more efficient to decode in parallel with batches. Furthermore, we propose an finite state automaton-based (FSA) parallel beam search algorithm that can run with graphs on GPU efficiently. The experiment results show that we achieve slight word error rate (WER) improvement as well as significant speedup in decoding. Our work is open-sourced and publicly available\footnote{https://github.com/k2-fsa/icefall}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.00484v1-abstract-full').style.display = 'none'; document.getElementById('2211.00484v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 October, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Submitted to 2023 IEEE International Conference on Acoustics, Speech and Signal Processing</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2209.01702">arXiv:2209.01702</a> <span> [<a href="https://arxiv.org/pdf/2209.01702">pdf</a>, <a href="https://arxiv.org/format/2209.01702">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Time-domain speech super-resolution with GAN based modeling for telephony speaker verification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Kataria%2C+S">Saurabh Kataria</a>, <a href="/search/eess?searchtype=author&query=Villalba%2C+J">Jes煤s Villalba</a>, <a href="/search/eess?searchtype=author&query=Moro-Vel%C3%A1zquez%2C+L">Laureano Moro-Vel谩zquez</a>, <a href="/search/eess?searchtype=author&query=%C5%BBelasko%2C+P">Piotr 呕elasko</a>, <a href="/search/eess?searchtype=author&query=Dehak%2C+N">Najim Dehak</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2209.01702v1-abstract-short" style="display: inline;"> Automatic Speaker Verification (ASV) technology has become commonplace in virtual assistants. However, its performance suffers when there is a mismatch between the train and test domains. Mixed bandwidth training, i.e., pooling training data from both domains, is a preferred choice for developing a universal model that works for both narrowband and wideband domains. We propose complementing this t… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2209.01702v1-abstract-full').style.display = 'inline'; document.getElementById('2209.01702v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2209.01702v1-abstract-full" style="display: none;"> Automatic Speaker Verification (ASV) technology has become commonplace in virtual assistants. However, its performance suffers when there is a mismatch between the train and test domains. Mixed bandwidth training, i.e., pooling training data from both domains, is a preferred choice for developing a universal model that works for both narrowband and wideband domains. We propose complementing this technique by performing neural upsampling of narrowband signals, also known as bandwidth extension. Our main goal is to discover and analyze high-performing time-domain Generative Adversarial Network (GAN) based models to improve our downstream state-of-the-art ASV system. We choose GANs since they (1) are powerful for learning conditional distribution and (2) allow flexible plug-in usage as a pre-processor during the training of downstream task (ASV) with data augmentation. Prior works mainly focus on feature-domain bandwidth extension and limited experimental setups. We address these limitations by 1) using time-domain extension models, 2) reporting results on three real test sets, 2) extending training data, and 3) devising new test-time schemes. We compare supervised (conditional GAN) and unsupervised GANs (CycleGAN) and demonstrate average relative improvement in Equal Error Rate of 8.6% and 7.7%, respectively. For further analysis, we study changes in spectrogram visual quality, audio perceptual quality, t-SNE embeddings, and ASV score distributions. We show that our bandwidth extension leads to phenomena such as a shift of telephone (test) embeddings towards wideband (train) signals, a negative correlation of perceptual quality with downstream performance, and condition-independent score calibration. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2209.01702v1-abstract-full').style.display = 'none'; document.getElementById('2209.01702v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 September, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Submit to IEEE/ACM Transactions on Audio, Speech, and Language Processing</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2208.05413">arXiv:2208.05413</a> <span> [<a href="https://arxiv.org/pdf/2208.05413">pdf</a>, <a href="https://arxiv.org/format/2208.05413">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Non-Contrastive Self-Supervised Learning of Utterance-Level Speech Representations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Cho%2C+J">Jaejin Cho</a>, <a href="/search/eess?searchtype=author&query=Pappagari%2C+R">Raghavendra Pappagari</a>, <a href="/search/eess?searchtype=author&query=%C5%BBelasko%2C+P">Piotr 呕elasko</a>, <a href="/search/eess?searchtype=author&query=Moro-Velazquez%2C+L">Laureano Moro-Velazquez</a>, <a href="/search/eess?searchtype=author&query=Villalba%2C+J">Jes煤s Villalba</a>, <a href="/search/eess?searchtype=author&query=Dehak%2C+N">Najim Dehak</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2208.05413v1-abstract-short" style="display: inline;"> Considering the abundance of unlabeled speech data and the high labeling costs, unsupervised learning methods can be essential for better system development. One of the most successful methods is contrastive self-supervised methods, which require negative sampling: sampling alternative samples to contrast with the current sample (anchor). However, it is hard to ensure if all the negative samples b… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2208.05413v1-abstract-full').style.display = 'inline'; document.getElementById('2208.05413v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2208.05413v1-abstract-full" style="display: none;"> Considering the abundance of unlabeled speech data and the high labeling costs, unsupervised learning methods can be essential for better system development. One of the most successful methods is contrastive self-supervised methods, which require negative sampling: sampling alternative samples to contrast with the current sample (anchor). However, it is hard to ensure if all the negative samples belong to classes different from the anchor class without labels. This paper applies a non-contrastive self-supervised learning method on an unlabeled speech corpus to learn utterance-level embeddings. We used DIstillation with NO labels (DINO), proposed in computer vision, and adapted it to the speech domain. Unlike the contrastive methods, DINO does not require negative sampling. These embeddings were evaluated on speaker verification and emotion recognition. In speaker verification, the unsupervised DINO embedding with cosine scoring provided 4.38% EER on the VoxCeleb1 test trial. This outperforms the best contrastive self-supervised method by 40% relative in EER. An iterative pseudo-labeling training pipeline, not requiring speaker labels, further improved the EER to 1.89%. In emotion recognition, the DINO embedding performed 60.87, 79.21, and 56.98% in micro-f1 score on IEMOCAP, Crema-D, and MSP-Podcast, respectively. The results imply the generality of the DINO embedding to different speech applications. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2208.05413v1-abstract-full').style.display = 'none'; document.getElementById('2208.05413v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 August, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at Interspeech 2022</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2204.03851">arXiv:2204.03851</a> <span> [<a href="https://arxiv.org/pdf/2204.03851">pdf</a>, <a href="https://arxiv.org/format/2204.03851">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Defense against Adversarial Attacks on Hybrid Speech Recognition using Joint Adversarial Fine-tuning with Denoiser </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Joshi%2C+S">Sonal Joshi</a>, <a href="/search/eess?searchtype=author&query=Kataria%2C+S">Saurabh Kataria</a>, <a href="/search/eess?searchtype=author&query=Shao%2C+Y">Yiwen Shao</a>, <a href="/search/eess?searchtype=author&query=Zelasko%2C+P">Piotr Zelasko</a>, <a href="/search/eess?searchtype=author&query=Villalba%2C+J">Jesus Villalba</a>, <a href="/search/eess?searchtype=author&query=Khudanpur%2C+S">Sanjeev Khudanpur</a>, <a href="/search/eess?searchtype=author&query=Dehak%2C+N">Najim Dehak</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2204.03851v1-abstract-short" style="display: inline;"> Adversarial attacks are a threat to automatic speech recognition (ASR) systems, and it becomes imperative to propose defenses to protect them. In this paper, we perform experiments to show that K2 conformer hybrid ASR is strongly affected by white-box adversarial attacks. We propose three defenses--denoiser pre-processor, adversarially fine-tuning ASR model, and adversarially fine-tuning joint mod… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2204.03851v1-abstract-full').style.display = 'inline'; document.getElementById('2204.03851v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2204.03851v1-abstract-full" style="display: none;"> Adversarial attacks are a threat to automatic speech recognition (ASR) systems, and it becomes imperative to propose defenses to protect them. In this paper, we perform experiments to show that K2 conformer hybrid ASR is strongly affected by white-box adversarial attacks. We propose three defenses--denoiser pre-processor, adversarially fine-tuning ASR model, and adversarially fine-tuning joint model of ASR and denoiser. Our evaluation shows denoiser pre-processor (trained on offline adversarial examples) fails to defend against adaptive white-box attacks. However, adversarially fine-tuning the denoiser using a tandem model of denoiser and ASR offers more robustness. We evaluate two variants of this defense--one updating parameters of both models and the second keeping ASR frozen. The joint model offers a mean absolute decrease of 19.3\% ground truth (GT) WER with reference to baseline against fast gradient sign method (FGSM) attacks with different $L_\infty$ norms. The joint model with frozen ASR parameters gives the best defense against projected gradient descent (PGD) with 7 iterations, yielding a mean absolute increase of 22.3\% GT WER with reference to baseline; and against PGD with 500 iterations, yielding a mean absolute decrease of 45.08\% GT WER and an increase of 68.05\% adversarial target WER. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2204.03851v1-abstract-full').style.display = 'none'; document.getElementById('2204.03851v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 April, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Submitted to Interspeech 2022</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2201.11207">arXiv:2201.11207</a> <span> [<a href="https://arxiv.org/pdf/2201.11207">pdf</a>, <a href="https://arxiv.org/format/2201.11207">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Discovering Phonetic Inventories with Crosslingual Automatic Speech Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=%C5%BBelasko%2C+P">Piotr 呕elasko</a>, <a href="/search/eess?searchtype=author&query=Feng%2C+S">Siyuan Feng</a>, <a href="/search/eess?searchtype=author&query=Velazquez%2C+L+M">Laureano Moro Velazquez</a>, <a href="/search/eess?searchtype=author&query=Abavisani%2C+A">Ali Abavisani</a>, <a href="/search/eess?searchtype=author&query=Bhati%2C+S">Saurabhchand Bhati</a>, <a href="/search/eess?searchtype=author&query=Scharenborg%2C+O">Odette Scharenborg</a>, <a href="/search/eess?searchtype=author&query=Hasegawa-Johnson%2C+M">Mark Hasegawa-Johnson</a>, <a href="/search/eess?searchtype=author&query=Dehak%2C+N">Najim Dehak</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2201.11207v2-abstract-short" style="display: inline;"> The high cost of data acquisition makes Automatic Speech Recognition (ASR) model training problematic for most existing languages, including languages that do not even have a written script, or for which the phone inventories remain unknown. Past works explored multilingual training, transfer learning, as well as zero-shot learning in order to build ASR systems for these low-resource languages. Wh… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2201.11207v2-abstract-full').style.display = 'inline'; document.getElementById('2201.11207v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2201.11207v2-abstract-full" style="display: none;"> The high cost of data acquisition makes Automatic Speech Recognition (ASR) model training problematic for most existing languages, including languages that do not even have a written script, or for which the phone inventories remain unknown. Past works explored multilingual training, transfer learning, as well as zero-shot learning in order to build ASR systems for these low-resource languages. While it has been shown that the pooling of resources from multiple languages is helpful, we have not yet seen a successful application of an ASR model to a language unseen during training. A crucial step in the adaptation of ASR from seen to unseen languages is the creation of the phone inventory of the unseen language. The ultimate goal of our work is to build the phone inventory of a language unseen during training in an unsupervised way without any knowledge about the language. In this paper, we 1) investigate the influence of different factors (i.e., model architecture, phonotactic model, type of speech representation) on phone recognition in an unknown language; 2) provide an analysis of which phones transfer well across languages and which do not in order to understand the limitations of and areas for further improvement for automatic phone inventory creation; and 3) present different methods to build a phone inventory of an unseen language in an unsupervised way. To that end, we conducted mono-, multi-, and crosslingual experiments on a set of 13 phonetically diverse languages and several in-depth analyses. We found a number of universal phone tokens (IPA symbols) that are well-recognized cross-linguistically. Through a detailed analysis of results, we conclude that unique sounds, similar sounds, and tone languages remain a major challenge for phonetic inventory discovery. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2201.11207v2-abstract-full').style.display = 'none'; document.getElementById('2201.11207v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 January, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 26 January, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted for publication in Computer Speech and Language</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2110.12561">arXiv:2110.12561</a> <span> [<a href="https://arxiv.org/pdf/2110.12561">pdf</a>, <a href="https://arxiv.org/format/2110.12561">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Lhotse: a speech data representation library for the modern deep learning ecosystem </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=%C5%BBelasko%2C+P">Piotr 呕elasko</a>, <a href="/search/eess?searchtype=author&query=Povey%2C+D">Daniel Povey</a>, <a href="/search/eess?searchtype=author&query=Trmal%2C+J+%22">Jan "Yenda" Trmal</a>, <a href="/search/eess?searchtype=author&query=Khudanpur%2C+S">Sanjeev Khudanpur</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2110.12561v1-abstract-short" style="display: inline;"> Speech data is notoriously difficult to work with due to a variety of codecs, lengths of recordings, and meta-data formats. We present Lhotse, a speech data representation library that draws upon lessons learned from Kaldi speech recognition toolkit and brings its concepts into the modern deep learning ecosystem. Lhotse provides a common JSON description format with corresponding Python classes an… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2110.12561v1-abstract-full').style.display = 'inline'; document.getElementById('2110.12561v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2110.12561v1-abstract-full" style="display: none;"> Speech data is notoriously difficult to work with due to a variety of codecs, lengths of recordings, and meta-data formats. We present Lhotse, a speech data representation library that draws upon lessons learned from Kaldi speech recognition toolkit and brings its concepts into the modern deep learning ecosystem. Lhotse provides a common JSON description format with corresponding Python classes and data preparation recipes for over 30 popular speech corpora. Various datasets can be easily combined together and re-purposed for different tasks. The library handles multi-channel recordings, long recordings, local and cloud storage, lazy and on-the-fly operations amongst other features. We introduce Cut and CutSet concepts, which simplify common data wrangling tasks for audio and help incorporate acoustic context of speech utterances. Finally, we show how Lhotse leverages PyTorch data API abstractions and adopts them to handle speech data for deep learning. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2110.12561v1-abstract-full').style.display = 'none'; document.getElementById('2110.12561v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 October, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted for presentation at NeurIPS 2021 Data-Centric AI (DCAI) Workshop</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2110.02345">arXiv:2110.02345</a> <span> [<a href="https://arxiv.org/pdf/2110.02345">pdf</a>, <a href="https://arxiv.org/format/2110.02345">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Unsupervised Speech Segmentation and Variable Rate Representation Learning using Segmental Contrastive Predictive Coding </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Bhati%2C+S">Saurabhchand Bhati</a>, <a href="/search/eess?searchtype=author&query=Villalba%2C+J">Jes煤s Villalba</a>, <a href="/search/eess?searchtype=author&query=%C5%BBelasko%2C+P">Piotr 呕elasko</a>, <a href="/search/eess?searchtype=author&query=Moro-Velazquez%2C+L">Laureano Moro-Velazquez</a>, <a href="/search/eess?searchtype=author&query=Dehak%2C+N">Najim Dehak</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2110.02345v2-abstract-short" style="display: inline;"> Typically, unsupervised segmentation of speech into the phone and word-like units are treated as separate tasks and are often done via different methods which do not fully leverage the inter-dependence of the two tasks. Here, we unify them and propose a technique that can jointly perform both, showing that these two tasks indeed benefit from each other. Recent attempts employ self-supervised learn… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2110.02345v2-abstract-full').style.display = 'inline'; document.getElementById('2110.02345v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2110.02345v2-abstract-full" style="display: none;"> Typically, unsupervised segmentation of speech into the phone and word-like units are treated as separate tasks and are often done via different methods which do not fully leverage the inter-dependence of the two tasks. Here, we unify them and propose a technique that can jointly perform both, showing that these two tasks indeed benefit from each other. Recent attempts employ self-supervised learning, such as contrastive predictive coding (CPC), where the next frame is predicted given past context. However, CPC only looks at the audio signal's frame-level structure. We overcome this limitation with a segmental contrastive predictive coding (SCPC) framework to model the signal structure at a higher level, e.g., phone level. A convolutional neural network learns frame-level representation from the raw waveform via noise-contrastive estimation (NCE). A differentiable boundary detector finds variable-length segments, which are then used to optimize a segment encoder via NCE to learn segment representations. The differentiable boundary detector allows us to train frame-level and segment-level encoders jointly. Experiments show that our single model outperforms existing phone and word segmentation methods on TIMIT and Buckeye datasets. We discover that phone class impacts the boundary detection performance, and the boundaries between successive vowels or semivowels are the most difficult to identify. Finally, we use SCPC to extract speech features at the segment level rather than at uniformly spaced frame level (e.g., 10 ms) and produce variable rate representations that change according to the contents of the utterance. We can lower the feature extraction rate from the typical 100 Hz to as low as 14.5 Hz on average while still outperforming the MFCC features on the linear phone classification task. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2110.02345v2-abstract-full').style.display = 'none'; document.getElementById('2110.02345v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 October, 2021; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 5 October, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">arXiv admin note: substantial text overlap with arXiv:2106.02170</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2109.06112">arXiv:2109.06112</a> <span> [<a href="https://arxiv.org/pdf/2109.06112">pdf</a>, <a href="https://arxiv.org/format/2109.06112">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Beyond Isolated Utterances: Conversational Emotion Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Pappagari%2C+R">Raghavendra Pappagari</a>, <a href="/search/eess?searchtype=author&query=%C5%BBelasko%2C+P">Piotr 呕elasko</a>, <a href="/search/eess?searchtype=author&query=Villalba%2C+J">Jes煤s Villalba</a>, <a href="/search/eess?searchtype=author&query=Moro-Velazquez%2C+L">Laureano Moro-Velazquez</a>, <a href="/search/eess?searchtype=author&query=Dehak%2C+N">Najim Dehak</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2109.06112v1-abstract-short" style="display: inline;"> Speech emotion recognition is the task of recognizing the speaker's emotional state given a recording of their utterance. While most of the current approaches focus on inferring emotion from isolated utterances, we argue that this is not sufficient to achieve conversational emotion recognition (CER) which deals with recognizing emotions in conversations. In this work, we propose several approaches… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2109.06112v1-abstract-full').style.display = 'inline'; document.getElementById('2109.06112v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2109.06112v1-abstract-full" style="display: none;"> Speech emotion recognition is the task of recognizing the speaker's emotional state given a recording of their utterance. While most of the current approaches focus on inferring emotion from isolated utterances, we argue that this is not sufficient to achieve conversational emotion recognition (CER) which deals with recognizing emotions in conversations. In this work, we propose several approaches for CER by treating it as a sequence labeling task. We investigated transformer architecture for CER and, compared it with ResNet-34 and BiLSTM architectures in both contextual and context-less scenarios using IEMOCAP corpus. Based on the inner workings of the self-attention mechanism, we proposed DiverseCatAugment (DCA), an augmentation scheme, which improved the transformer model performance by an absolute 3.3% micro-f1 on conversations and 3.6% on isolated utterances. We further enhanced the performance by introducing an interlocutor-aware transformer model where we learn a dictionary of interlocutor index embeddings to exploit diarized conversations. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2109.06112v1-abstract-full').style.display = 'none'; document.getElementById('2109.06112v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 September, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted for ASRU 2021</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2107.04448">arXiv:2107.04448</a> <span> [<a href="https://arxiv.org/pdf/2107.04448">pdf</a>, <a href="https://arxiv.org/format/2107.04448">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Representation Learning to Classify and Detect Adversarial Attacks against Speaker and Speech Recognition Systems </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Villalba%2C+J">Jes煤s Villalba</a>, <a href="/search/eess?searchtype=author&query=Joshi%2C+S">Sonal Joshi</a>, <a href="/search/eess?searchtype=author&query=%C5%BBelasko%2C+P">Piotr 呕elasko</a>, <a href="/search/eess?searchtype=author&query=Dehak%2C+N">Najim Dehak</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2107.04448v1-abstract-short" style="display: inline;"> Adversarial attacks have become a major threat for machine learning applications. There is a growing interest in studying these attacks in the audio domain, e.g, speech and speaker recognition; and find defenses against them. In this work, we focus on using representation learning to classify/detect attacks w.r.t. the attack algorithm, threat model or signal-to-adversarial-noise ratio. We found th… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2107.04448v1-abstract-full').style.display = 'inline'; document.getElementById('2107.04448v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2107.04448v1-abstract-full" style="display: none;"> Adversarial attacks have become a major threat for machine learning applications. There is a growing interest in studying these attacks in the audio domain, e.g, speech and speaker recognition; and find defenses against them. In this work, we focus on using representation learning to classify/detect attacks w.r.t. the attack algorithm, threat model or signal-to-adversarial-noise ratio. We found that common attacks in the literature can be classified with accuracies as high as 90%. Also, representations trained to classify attacks against speaker identification can be used also to classify attacks against speaker verification and speech recognition. We also tested an attack verification task, where we need to decide whether two speech utterances contain the same attack. We observed that our models did not generalize well to attack algorithms not included in the attack representation model training. Motivated by this, we evaluated an unknown attack detection task. We were able to detect unknown attacks with equal error rates of about 19%, which is promising. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2107.04448v1-abstract-full').style.display = 'none'; document.getElementById('2107.04448v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 July, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at Interspeech 2021</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2106.02170">arXiv:2106.02170</a> <span> [<a href="https://arxiv.org/pdf/2106.02170">pdf</a>, <a href="https://arxiv.org/format/2106.02170">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Segmental Contrastive Predictive Coding for Unsupervised Word Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Bhati%2C+S">Saurabhchand Bhati</a>, <a href="/search/eess?searchtype=author&query=Villalba%2C+J">Jes煤s Villalba</a>, <a href="/search/eess?searchtype=author&query=%C5%BBelasko%2C+P">Piotr 呕elasko</a>, <a href="/search/eess?searchtype=author&query=Moro-Velazquez%2C+L">Laureano Moro-Velazquez</a>, <a href="/search/eess?searchtype=author&query=Dehak%2C+N">Najim Dehak</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2106.02170v1-abstract-short" style="display: inline;"> Automatic detection of phoneme or word-like units is one of the core objectives in zero-resource speech processing. Recent attempts employ self-supervised training methods, such as contrastive predictive coding (CPC), where the next frame is predicted given past context. However, CPC only looks at the audio signal's frame-level structure. We overcome this limitation with a segmental contrastive pr… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2106.02170v1-abstract-full').style.display = 'inline'; document.getElementById('2106.02170v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2106.02170v1-abstract-full" style="display: none;"> Automatic detection of phoneme or word-like units is one of the core objectives in zero-resource speech processing. Recent attempts employ self-supervised training methods, such as contrastive predictive coding (CPC), where the next frame is predicted given past context. However, CPC only looks at the audio signal's frame-level structure. We overcome this limitation with a segmental contrastive predictive coding (SCPC) framework that can model the signal structure at a higher level e.g. at the phoneme level. In this framework, a convolutional neural network learns frame-level representation from the raw waveform via noise-contrastive estimation (NCE). A differentiable boundary detector finds variable-length segments, which are then used to optimize a segment encoder via NCE to learn segment representations. The differentiable boundary detector allows us to train frame-level and segment-level encoders jointly. Typically, phoneme and word segmentation are treated as separate tasks. We unify them and experimentally show that our single model outperforms existing phoneme and word segmentation methods on TIMIT and Buckeye datasets. We analyze the impact of boundary threshold and when is the right time to include the segmental loss in the learning process. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2106.02170v1-abstract-full').style.display = 'none'; document.getElementById('2106.02170v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 June, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2021. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2104.11348">arXiv:2104.11348</a> <span> [<a href="https://arxiv.org/pdf/2104.11348">pdf</a>, <a href="https://arxiv.org/format/2104.11348">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.21437/Interspeech.2021-1915">10.21437/Interspeech.2021-1915 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Earnings-21: A Practical Benchmark for ASR in the Wild </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Del+Rio%2C+M">Miguel Del Rio</a>, <a href="/search/eess?searchtype=author&query=Delworth%2C+N">Natalie Delworth</a>, <a href="/search/eess?searchtype=author&query=Westerman%2C+R">Ryan Westerman</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+M">Michelle Huang</a>, <a href="/search/eess?searchtype=author&query=Bhandari%2C+N">Nishchal Bhandari</a>, <a href="/search/eess?searchtype=author&query=Palakapilly%2C+J">Joseph Palakapilly</a>, <a href="/search/eess?searchtype=author&query=McNamara%2C+Q">Quinten McNamara</a>, <a href="/search/eess?searchtype=author&query=Dong%2C+J">Joshua Dong</a>, <a href="/search/eess?searchtype=author&query=Zelasko%2C+P">Piotr Zelasko</a>, <a href="/search/eess?searchtype=author&query=Jette%2C+M">Miguel Jette</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2104.11348v3-abstract-short" style="display: inline;"> Commonly used speech corpora inadequately challenge academic and commercial ASR systems. In particular, speech corpora lack metadata needed for detailed analysis and WER measurement. In response, we present Earnings-21, a 39-hour corpus of earnings calls containing entity-dense speech from nine different financial sectors. This corpus is intended to benchmark ASR systems in the wild with special a… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2104.11348v3-abstract-full').style.display = 'inline'; document.getElementById('2104.11348v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2104.11348v3-abstract-full" style="display: none;"> Commonly used speech corpora inadequately challenge academic and commercial ASR systems. In particular, speech corpora lack metadata needed for detailed analysis and WER measurement. In response, we present Earnings-21, a 39-hour corpus of earnings calls containing entity-dense speech from nine different financial sectors. This corpus is intended to benchmark ASR systems in the wild with special attention towards named entity recognition. We benchmark four commercial ASR models, two internal models built with open-source tools, and an open-source LibriSpeech model and discuss their differences in performance on Earnings-21. Using our recently released fstalign tool, we provide a candid analysis of each model's recognition capabilities under different partitions. Our analysis finds that ASR accuracy for certain NER categories is poor, presenting a significant impediment to transcript comprehension and usage. Earnings-21 bridges academic and commercial ASR system evaluation and enables further research on entity modeling and WER on real world audio. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2104.11348v3-abstract-full').style.display = 'none'; document.getElementById('2104.11348v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 June, 2021; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 22 April, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to INTERSPEECH 2021. June 15 2021: Addressing the comments of reviewers and updating the results of our internal ESPNet model. The results do not change our conclusions. April 28th, 2021: We found and resolved an issue in our experimental evaluation that scored the LibriSpeech model at ~20% worse relative WER than the actual WER. The updated results do not affect our conclusions</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2104.01433">arXiv:2104.01433</a> <span> [<a href="https://arxiv.org/pdf/2104.01433">pdf</a>, <a href="https://arxiv.org/format/2104.01433">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Deep Feature CycleGANs: Speaker Identity Preserving Non-parallel Microphone-Telephone Domain Adaptation for Speaker Verification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Kataria%2C+S">Saurabh Kataria</a>, <a href="/search/eess?searchtype=author&query=Villalba%2C+J">Jes煤s Villalba</a>, <a href="/search/eess?searchtype=author&query=%C5%BBelasko%2C+P">Piotr 呕elasko</a>, <a href="/search/eess?searchtype=author&query=Moro-Vel%C3%A1zquez%2C+L">Laureano Moro-Vel谩zquez</a>, <a href="/search/eess?searchtype=author&query=Dehak%2C+N">Najim Dehak</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2104.01433v1-abstract-short" style="display: inline;"> With the increase in the availability of speech from varied domains, it is imperative to use such out-of-domain data to improve existing speech systems. Domain adaptation is a prominent pre-processing approach for this. We investigate it for adapt microphone speech to the telephone domain. Specifically, we explore CycleGAN-based unpaired translation of microphone data to improve the x-vector/speak… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2104.01433v1-abstract-full').style.display = 'inline'; document.getElementById('2104.01433v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2104.01433v1-abstract-full" style="display: none;"> With the increase in the availability of speech from varied domains, it is imperative to use such out-of-domain data to improve existing speech systems. Domain adaptation is a prominent pre-processing approach for this. We investigate it for adapt microphone speech to the telephone domain. Specifically, we explore CycleGAN-based unpaired translation of microphone data to improve the x-vector/speaker embedding network for Telephony Speaker Verification. We first demonstrate the efficacy of this on real challenging data and then, to improve further, we modify the CycleGAN formulation to make the adaptation task-specific. We modify CycleGAN's identity loss, cycle-consistency loss, and adversarial loss to operate in the deep feature space. Deep features of a signal are extracted from an auxiliary (speaker embedding) network and, hence, preserves speaker identity. Our 3D convolution-based Deep Feature Discriminators (DFD) show relative improvements of 5-10% in terms of equal error rate. To dive deeper, we study a challenging scenario of pooling (adapted) microphone and telephone data with data augmentations and telephone codecs. Finally, we highlight the sensitivity of CycleGAN hyper-parameters and introduce a parameter called probability of adaptation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2104.01433v1-abstract-full').style.display = 'none'; document.getElementById('2104.01433v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 April, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2021. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2104.00994">arXiv:2104.00994</a> <span> [<a href="https://arxiv.org/pdf/2104.00994">pdf</a>, <a href="https://arxiv.org/format/2104.00994">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Unsupervised Acoustic Unit Discovery by Leveraging a Language-Independent Subword Discriminative Feature Representation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Feng%2C+S">Siyuan Feng</a>, <a href="/search/eess?searchtype=author&query=%C5%BBelasko%2C+P">Piotr 呕elasko</a>, <a href="/search/eess?searchtype=author&query=Moro-Vel%C3%A1zquez%2C+L">Laureano Moro-Vel谩zquez</a>, <a href="/search/eess?searchtype=author&query=Scharenborg%2C+O">Odette Scharenborg</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2104.00994v2-abstract-short" style="display: inline;"> This paper tackles automatically discovering phone-like acoustic units (AUD) from unlabeled speech data. Past studies usually proposed single-step approaches. We propose a two-stage approach: the first stage learns a subword-discriminative feature representation and the second stage applies clustering to the learned representation and obtains phone-like clusters as the discovered acoustic units. I… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2104.00994v2-abstract-full').style.display = 'inline'; document.getElementById('2104.00994v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2104.00994v2-abstract-full" style="display: none;"> This paper tackles automatically discovering phone-like acoustic units (AUD) from unlabeled speech data. Past studies usually proposed single-step approaches. We propose a two-stage approach: the first stage learns a subword-discriminative feature representation and the second stage applies clustering to the learned representation and obtains phone-like clusters as the discovered acoustic units. In the first stage, a recently proposed method in the task of unsupervised subword modeling is improved by replacing a monolingual out-of-domain (OOD) ASR system with a multilingual one to create a subword-discriminative representation that is more language-independent. In the second stage, segment-level k-means is adopted, and two methods to represent the variable-length speech segments as fixed-dimension feature vectors are compared. Experiments on a very low-resource Mboshi language corpus show that our approach outperforms state-of-the-art AUD in both normalized mutual information (NMI) and F-score. The multilingual ASR improved upon the monolingual ASR in providing OOD phone labels and in estimating the phone boundaries. A comparison of our systems with and without knowing the ground-truth phone boundaries showed a 16% NMI performance gap, suggesting that the current approach can significantly benefit from improved phone boundary estimation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2104.00994v2-abstract-full').style.display = 'none'; document.getElementById('2104.00994v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 June, 2021; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 2 April, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted for publication in INTERSPEECH 2021</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2103.17122">arXiv:2103.17122</a> <span> [<a href="https://arxiv.org/pdf/2103.17122">pdf</a>, <a href="https://arxiv.org/ps/2103.17122">ps</a>, <a href="https://arxiv.org/format/2103.17122">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Adversarial Attacks and Defenses for Speech Recognition Systems </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=%C5%BBelasko%2C+P">Piotr 呕elasko</a>, <a href="/search/eess?searchtype=author&query=Joshi%2C+S">Sonal Joshi</a>, <a href="/search/eess?searchtype=author&query=Shao%2C+Y">Yiwen Shao</a>, <a href="/search/eess?searchtype=author&query=Villalba%2C+J">Jesus Villalba</a>, <a href="/search/eess?searchtype=author&query=Trmal%2C+J">Jan Trmal</a>, <a href="/search/eess?searchtype=author&query=Dehak%2C+N">Najim Dehak</a>, <a href="/search/eess?searchtype=author&query=Khudanpur%2C+S">Sanjeev Khudanpur</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2103.17122v1-abstract-short" style="display: inline;"> The ubiquitous presence of machine learning systems in our lives necessitates research into their vulnerabilities and appropriate countermeasures. In particular, we investigate the effectiveness of adversarial attacks and defenses against automatic speech recognition (ASR) systems. We select two ASR models - a thoroughly studied DeepSpeech model and a more recent Espresso framework Transformer enc… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2103.17122v1-abstract-full').style.display = 'inline'; document.getElementById('2103.17122v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2103.17122v1-abstract-full" style="display: none;"> The ubiquitous presence of machine learning systems in our lives necessitates research into their vulnerabilities and appropriate countermeasures. In particular, we investigate the effectiveness of adversarial attacks and defenses against automatic speech recognition (ASR) systems. We select two ASR models - a thoroughly studied DeepSpeech model and a more recent Espresso framework Transformer encoder-decoder model. We investigate two threat models: a denial-of-service scenario where fast gradient-sign method (FGSM) or weak projected gradient descent (PGD) attacks are used to degrade the model's word error rate (WER); and a targeted scenario where a more potent imperceptible attack forces the system to recognize a specific phrase. We find that the attack transferability across the investigated ASR systems is limited. To defend the model, we use two preprocessing defenses: randomized smoothing and WaveGAN-based vocoder, and find that they significantly improve the model's adversarial robustness. We show that a WaveGAN vocoder can be a useful countermeasure to adversarial attacks on ASR systems - even when it is jointly attacked with the ASR, the target phrases' word error rate is high. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2103.17122v1-abstract-full').style.display = 'none'; document.getElementById('2103.17122v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 March, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This work has been submitted to the IEEE for possible publication</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2101.08909">arXiv:2101.08909</a> <span> [<a href="https://arxiv.org/pdf/2101.08909">pdf</a>, <a href="https://arxiv.org/format/2101.08909">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Study of Pre-processing Defenses against Adversarial Attacks on State-of-the-art Speaker Recognition Systems </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Joshi%2C+S">Sonal Joshi</a>, <a href="/search/eess?searchtype=author&query=Villalba%2C+J">Jes煤s Villalba</a>, <a href="/search/eess?searchtype=author&query=%C5%BBelasko%2C+P">Piotr 呕elasko</a>, <a href="/search/eess?searchtype=author&query=Moro-Vel%C3%A1zquez%2C+L">Laureano Moro-Vel谩zquez</a>, <a href="/search/eess?searchtype=author&query=Dehak%2C+N">Najim Dehak</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2101.08909v2-abstract-short" style="display: inline;"> Adversarial examples to speaker recognition (SR) systems are generated by adding a carefully crafted noise to the speech signal to make the system fail while being imperceptible to humans. Such attacks pose severe security risks, making it vital to deep-dive and understand how much the state-of-the-art SR systems are vulnerable to these attacks. Moreover, it is of greater importance to propose def… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2101.08909v2-abstract-full').style.display = 'inline'; document.getElementById('2101.08909v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2101.08909v2-abstract-full" style="display: none;"> Adversarial examples to speaker recognition (SR) systems are generated by adding a carefully crafted noise to the speech signal to make the system fail while being imperceptible to humans. Such attacks pose severe security risks, making it vital to deep-dive and understand how much the state-of-the-art SR systems are vulnerable to these attacks. Moreover, it is of greater importance to propose defenses that can protect the systems against these attacks. Addressing these concerns, this paper at first investigates how state-of-the-art x-vector based SR systems are affected by white-box adversarial attacks, i.e., when the adversary has full knowledge of the system. x-Vector based SR systems are evaluated against white-box adversarial attacks common in the literature like fast gradient sign method (FGSM), basic iterative method (BIM)--a.k.a. iterative-FGSM--, projected gradient descent (PGD), and Carlini-Wagner (CW) attack. To mitigate against these attacks, the paper proposes four pre-processing defenses. It evaluates them against powerful adaptive white-box adversarial attacks, i.e., when the adversary has full knowledge of the system, including the defense. The four pre-processing defenses--viz. randomized smoothing, DefenseGAN, variational autoencoder (VAE), and Parallel WaveGAN vocoder (PWG) are compared against the baseline defense of adversarial training. Conclusions indicate that SR systems were extremely vulnerable under BIM, PGD, and CW attacks. Among the proposed pre-processing defenses, PWG combined with randomized smoothing offers the most protection against the attacks, with accuracy averaging 93% compared to 52% in the undefended system and an absolute improvement >90% for BIM attacks with $L_\infty>0.001$ and CW attack. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2101.08909v2-abstract-full').style.display = 'none'; document.getElementById('2101.08909v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 June, 2021; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 21 January, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This work has been submitted to the IEEE for possible publication</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2011.01210">arXiv:2011.01210</a> <span> [<a href="https://arxiv.org/pdf/2011.01210">pdf</a>, <a href="https://arxiv.org/format/2011.01210">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Focus on the present: a regularization method for the ASR source-target attention layer </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Chen%2C+N">Nanxin Chen</a>, <a href="/search/eess?searchtype=author&query=%C5%BBelasko%2C+P">Piotr 呕elasko</a>, <a href="/search/eess?searchtype=author&query=Villalba%2C+J">Jes煤s Villalba</a>, <a href="/search/eess?searchtype=author&query=Dehak%2C+N">Najim Dehak</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2011.01210v1-abstract-short" style="display: inline;"> This paper introduces a novel method to diagnose the source-target attention in state-of-the-art end-to-end speech recognition models with joint connectionist temporal classification (CTC) and attention training. Our method is based on the fact that both, CTC and source-target attention, are acting on the same encoder representations. To understand the functionality of the attention, CTC is applie… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2011.01210v1-abstract-full').style.display = 'inline'; document.getElementById('2011.01210v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2011.01210v1-abstract-full" style="display: none;"> This paper introduces a novel method to diagnose the source-target attention in state-of-the-art end-to-end speech recognition models with joint connectionist temporal classification (CTC) and attention training. Our method is based on the fact that both, CTC and source-target attention, are acting on the same encoder representations. To understand the functionality of the attention, CTC is applied to compute the token posteriors given the attention outputs. We found that the source-target attention heads are able to predict several tokens ahead of the current one. Inspired by the observation, a new regularization method is proposed which leverages CTC to make source-target attention more focused on the frames corresponding to the output token being predicted by the decoder. Experiments reveal stable improvements up to 7\% and 13\% relatively with the proposed regularization on TED-LIUM 2 and LibriSpeech. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2011.01210v1-abstract-full').style.display = 'none'; document.getElementById('2011.01210v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 November, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2020. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">submitted to ICASSP2021. Personal use of this material is permitted. Permission from IEEE must be obtained for all other uses, in any current or future media, including reprinting/republishing this material for advertising or promotional purposes, creating new collective works, for resale or redistribution to servers or lists, or reuse of any copyrighted component of this work in other works</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2010.14602">arXiv:2010.14602</a> <span> [<a href="https://arxiv.org/pdf/2010.14602">pdf</a>, <a href="https://arxiv.org/ps/2010.14602">ps</a>, <a href="https://arxiv.org/format/2010.14602">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> CopyPaste: An Augmentation Method for Speech Emotion Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Pappagari%2C+R">Raghavendra Pappagari</a>, <a href="/search/eess?searchtype=author&query=Villalba%2C+J">Jes煤s Villalba</a>, <a href="/search/eess?searchtype=author&query=%C5%BBelasko%2C+P">Piotr 呕elasko</a>, <a href="/search/eess?searchtype=author&query=Moro-Velazquez%2C+L">Laureano Moro-Velazquez</a>, <a href="/search/eess?searchtype=author&query=Dehak%2C+N">Najim Dehak</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2010.14602v2-abstract-short" style="display: inline;"> Data augmentation is a widely used strategy for training robust machine learning models. It partially alleviates the problem of limited data for tasks like speech emotion recognition (SER), where collecting data is expensive and challenging. This study proposes CopyPaste, a perceptually motivated novel augmentation procedure for SER. Assuming that the presence of emotions other than neutral dictat… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2010.14602v2-abstract-full').style.display = 'inline'; document.getElementById('2010.14602v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2010.14602v2-abstract-full" style="display: none;"> Data augmentation is a widely used strategy for training robust machine learning models. It partially alleviates the problem of limited data for tasks like speech emotion recognition (SER), where collecting data is expensive and challenging. This study proposes CopyPaste, a perceptually motivated novel augmentation procedure for SER. Assuming that the presence of emotions other than neutral dictates a speaker's overall perceived emotion in a recording, concatenation of an emotional (emotion E) and a neutral utterance can still be labeled with emotion E. We hypothesize that SER performance can be improved using these concatenated utterances in model training. To verify this, three CopyPaste schemes are tested on two deep learning models: one trained independently and another using transfer learning from an x-vector model, a speaker recognition model. We observed that all three CopyPaste schemes improve SER performance on all the three datasets considered: MSP-Podcast, Crema-D, and IEMOCAP. Additionally, CopyPaste performs better than noise augmentation and, using them together improves the SER performance further. Our experiments on noisy test sets suggested that CopyPaste is effective even in noisy test conditions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2010.14602v2-abstract-full').style.display = 'none'; document.getElementById('2010.14602v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 February, 2021; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 27 October, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2020. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at ICASSP2021</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2010.12104">arXiv:2010.12104</a> <span> [<a href="https://arxiv.org/pdf/2010.12104">pdf</a>, <a href="https://arxiv.org/format/2010.12104">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/ICASSP39728.2021.9414478">10.1109/ICASSP39728.2021.9414478 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> How Phonotactics Affect Multilingual and Zero-shot ASR Performance </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Feng%2C+S">Siyuan Feng</a>, <a href="/search/eess?searchtype=author&query=%C5%BBelasko%2C+P">Piotr 呕elasko</a>, <a href="/search/eess?searchtype=author&query=Moro-Vel%C3%A1zquez%2C+L">Laureano Moro-Vel谩zquez</a>, <a href="/search/eess?searchtype=author&query=Abavisani%2C+A">Ali Abavisani</a>, <a href="/search/eess?searchtype=author&query=Hasegawa-Johnson%2C+M">Mark Hasegawa-Johnson</a>, <a href="/search/eess?searchtype=author&query=Scharenborg%2C+O">Odette Scharenborg</a>, <a href="/search/eess?searchtype=author&query=Dehak%2C+N">Najim Dehak</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2010.12104v2-abstract-short" style="display: inline;"> The idea of combining multiple languages' recordings to train a single automatic speech recognition (ASR) model brings the promise of the emergence of universal speech representation. Recently, a Transformer encoder-decoder model has been shown to leverage multilingual data well in IPA transcriptions of languages presented during training. However, the representations it learned were not successfu… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2010.12104v2-abstract-full').style.display = 'inline'; document.getElementById('2010.12104v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2010.12104v2-abstract-full" style="display: none;"> The idea of combining multiple languages' recordings to train a single automatic speech recognition (ASR) model brings the promise of the emergence of universal speech representation. Recently, a Transformer encoder-decoder model has been shown to leverage multilingual data well in IPA transcriptions of languages presented during training. However, the representations it learned were not successful in zero-shot transfer to unseen languages. Because that model lacks an explicit factorization of the acoustic model (AM) and language model (LM), it is unclear to what degree the performance suffered from differences in pronunciation or the mismatch in phonotactics. To gain more insight into the factors limiting zero-shot ASR transfer, we replace the encoder-decoder with a hybrid ASR system consisting of a separate AM and LM. Then, we perform an extensive evaluation of monolingual, multilingual, and crosslingual (zero-shot) acoustic and language models on a set of 13 phonetically diverse languages. We show that the gain from modeling crosslingual phonotactics is limited, and imposing a too strong model can hurt the zero-shot transfer. Furthermore, we find that a multilingual LM hurts a multilingual ASR system's performance, and retaining only the target language's phonotactic data in LM training is preferable. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2010.12104v2-abstract-full').style.display = 'none'; document.getElementById('2010.12104v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 February, 2021; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 22 October, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2020. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted for publication in IEEE ICASSP 2021. The first 2 authors contributed equally to this work</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2010.11221">arXiv:2010.11221</a> <span> [<a href="https://arxiv.org/pdf/2010.11221">pdf</a>, <a href="https://arxiv.org/format/2010.11221">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Learning Speaker Embedding from Text-to-Speech </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Cho%2C+J">Jaejin Cho</a>, <a href="/search/eess?searchtype=author&query=Zelasko%2C+P">Piotr Zelasko</a>, <a href="/search/eess?searchtype=author&query=Villalba%2C+J">Jesus Villalba</a>, <a href="/search/eess?searchtype=author&query=Watanabe%2C+S">Shinji Watanabe</a>, <a href="/search/eess?searchtype=author&query=Dehak%2C+N">Najim Dehak</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2010.11221v1-abstract-short" style="display: inline;"> Zero-shot multi-speaker Text-to-Speech (TTS) generates target speaker voices given an input text and the corresponding speaker embedding. In this work, we investigate the effectiveness of the TTS reconstruction objective to improve representation learning for speaker verification. We jointly trained end-to-end Tacotron 2 TTS and speaker embedding networks in a self-supervised fashion. We hypothesi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2010.11221v1-abstract-full').style.display = 'inline'; document.getElementById('2010.11221v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2010.11221v1-abstract-full" style="display: none;"> Zero-shot multi-speaker Text-to-Speech (TTS) generates target speaker voices given an input text and the corresponding speaker embedding. In this work, we investigate the effectiveness of the TTS reconstruction objective to improve representation learning for speaker verification. We jointly trained end-to-end Tacotron 2 TTS and speaker embedding networks in a self-supervised fashion. We hypothesize that the embeddings will contain minimal phonetic information since the TTS decoder will obtain that information from the textual input. TTS reconstruction can also be combined with speaker classification to enhance these embeddings further. Once trained, the speaker encoder computes representations for the speaker verification task, while the rest of the TTS blocks are discarded. We investigated training TTS from either manual or ASR-generated transcripts. The latter allows us to train embeddings on datasets without manual transcripts. We compared ASR transcripts and Kaldi phone alignments as TTS inputs, showing that the latter performed better due to their finer resolution. Unsupervised TTS embeddings improved EER by 2.06\% absolute with regard to i-vectors for the LibriTTS dataset. TTS with speaker classification loss improved EER by 0.28\% and 0.73\% absolutely from a model using only speaker classification loss in LibriTTS and Voxceleb1 respectively. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2010.11221v1-abstract-full').style.display = 'none'; document.getElementById('2010.11221v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 October, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2020. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2010.03432">arXiv:2010.03432</a> <span> [<a href="https://arxiv.org/pdf/2010.03432">pdf</a>, <a href="https://arxiv.org/format/2010.03432">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> WER we are and WER we think we are </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Szyma%C5%84ski%2C+P">Piotr Szyma艅ski</a>, <a href="/search/eess?searchtype=author&query=%C5%BBelasko%2C+P">Piotr 呕elasko</a>, <a href="/search/eess?searchtype=author&query=Morzy%2C+M">Mikolaj Morzy</a>, <a href="/search/eess?searchtype=author&query=Szymczak%2C+A">Adrian Szymczak</a>, <a href="/search/eess?searchtype=author&query=%C5%BBy%C5%82a-Hoppe%2C+M">Marzena 呕y艂a-Hoppe</a>, <a href="/search/eess?searchtype=author&query=Banaszczak%2C+J">Joanna Banaszczak</a>, <a href="/search/eess?searchtype=author&query=Augustyniak%2C+L">Lukasz Augustyniak</a>, <a href="/search/eess?searchtype=author&query=Mizgajski%2C+J">Jan Mizgajski</a>, <a href="/search/eess?searchtype=author&query=Carmiel%2C+Y">Yishay Carmiel</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2010.03432v1-abstract-short" style="display: inline;"> Natural language processing of conversational speech requires the availability of high-quality transcripts. In this paper, we express our skepticism towards the recent reports of very low Word Error Rates (WERs) achieved by modern Automatic Speech Recognition (ASR) systems on benchmark datasets. We outline several problems with popular benchmarks and compare three state-of-the-art commercial ASR s… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2010.03432v1-abstract-full').style.display = 'inline'; document.getElementById('2010.03432v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2010.03432v1-abstract-full" style="display: none;"> Natural language processing of conversational speech requires the availability of high-quality transcripts. In this paper, we express our skepticism towards the recent reports of very low Word Error Rates (WERs) achieved by modern Automatic Speech Recognition (ASR) systems on benchmark datasets. We outline several problems with popular benchmarks and compare three state-of-the-art commercial ASR systems on an internal dataset of real-life spontaneous human conversations and HUB'05 public benchmark. We show that WERs are significantly higher than the best reported results. We formulate a set of guidelines which may aid in the creation of real-life, multi-domain datasets with high quality annotations for training and testing of robust ASR systems. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2010.03432v1-abstract-full').style.display = 'none'; document.getElementById('2010.03432v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 October, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2020. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to EMNLP Findings</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2007.13033">arXiv:2007.13033</a> <span> [<a href="https://arxiv.org/pdf/2007.13033">pdf</a>, <a href="https://arxiv.org/format/2007.13033">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Self-Expressing Autoencoders for Unsupervised Spoken Term Discovery </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Bhati%2C+S">Saurabhchand Bhati</a>, <a href="/search/eess?searchtype=author&query=Villalba%2C+J">Jes煤s Villalba</a>, <a href="/search/eess?searchtype=author&query=%C5%BBelasko%2C+P">Piotr 呕elasko</a>, <a href="/search/eess?searchtype=author&query=Dehak%2C+N">Najim Dehak</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2007.13033v1-abstract-short" style="display: inline;"> Unsupervised spoken term discovery consists of two tasks: finding the acoustic segment boundaries and labeling acoustically similar segments with the same labels. We perform segmentation based on the assumption that the frame feature vectors are more similar within a segment than across the segments. Therefore, for strong segmentation performance, it is crucial that the features represent the phon… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2007.13033v1-abstract-full').style.display = 'inline'; document.getElementById('2007.13033v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2007.13033v1-abstract-full" style="display: none;"> Unsupervised spoken term discovery consists of two tasks: finding the acoustic segment boundaries and labeling acoustically similar segments with the same labels. We perform segmentation based on the assumption that the frame feature vectors are more similar within a segment than across the segments. Therefore, for strong segmentation performance, it is crucial that the features represent the phonetic properties of a frame more than other factors of variability. We achieve this via a self-expressing autoencoder framework. It consists of a single encoder and two decoders with shared weights. The encoder projects the input features into a latent representation. One of the decoders tries to reconstruct the input from these latent representations and the other from the self-expressed version of them. We use the obtained features to segment and cluster the speech data. We evaluate the performance of the proposed method in the Zero Resource 2020 challenge unit discovery task. The proposed system consistently outperforms the baseline, demonstrating the usefulness of the method in learning representations. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2007.13033v1-abstract-full').style.display = 'none'; document.getElementById('2007.13033v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 July, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2020. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2006.07898">arXiv:2006.07898</a> <span> [<a href="https://arxiv.org/pdf/2006.07898">pdf</a>, <a href="https://arxiv.org/format/2006.07898">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> The JHU Multi-Microphone Multi-Speaker ASR System for the CHiME-6 Challenge </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Arora%2C+A">Ashish Arora</a>, <a href="/search/eess?searchtype=author&query=Raj%2C+D">Desh Raj</a>, <a href="/search/eess?searchtype=author&query=Subramanian%2C+A+S">Aswin Shanmugam Subramanian</a>, <a href="/search/eess?searchtype=author&query=Li%2C+K">Ke Li</a>, <a href="/search/eess?searchtype=author&query=Ben-Yair%2C+B">Bar Ben-Yair</a>, <a href="/search/eess?searchtype=author&query=Maciejewski%2C+M">Matthew Maciejewski</a>, <a href="/search/eess?searchtype=author&query=%C5%BBelasko%2C+P">Piotr 呕elasko</a>, <a href="/search/eess?searchtype=author&query=Garc%C3%ADa%2C+P">Paola Garc铆a</a>, <a href="/search/eess?searchtype=author&query=Watanabe%2C+S">Shinji Watanabe</a>, <a href="/search/eess?searchtype=author&query=Khudanpur%2C+S">Sanjeev Khudanpur</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2006.07898v1-abstract-short" style="display: inline;"> This paper summarizes the JHU team's efforts in tracks 1 and 2 of the CHiME-6 challenge for distant multi-microphone conversational speech diarization and recognition in everyday home environments. We explore multi-array processing techniques at each stage of the pipeline, such as multi-array guided source separation (GSS) for enhancement and acoustic model training data, posterior fusion for spee… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2006.07898v1-abstract-full').style.display = 'inline'; document.getElementById('2006.07898v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2006.07898v1-abstract-full" style="display: none;"> This paper summarizes the JHU team's efforts in tracks 1 and 2 of the CHiME-6 challenge for distant multi-microphone conversational speech diarization and recognition in everyday home environments. We explore multi-array processing techniques at each stage of the pipeline, such as multi-array guided source separation (GSS) for enhancement and acoustic model training data, posterior fusion for speech activity detection, PLDA score fusion for diarization, and lattice combination for automatic speech recognition (ASR). We also report results with different acoustic model architectures, and integrate other techniques such as online multi-channel weighted prediction error (WPE) dereverberation and variational Bayes-hidden Markov model (VB-HMM) based overlap assignment to deal with reverberation and overlapping speakers, respectively. As a result of these efforts, our ASR systems achieve a word error rate of 40.5% and 67.5% on tracks 1 and 2, respectively, on the evaluation set. This is an improvement of 10.8% and 10.4% absolute, over the challenge baselines for the respective tracks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2006.07898v1-abstract-full').style.display = 'none'; document.getElementById('2006.07898v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 June, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2020. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Presented at the CHiME-6 workshop (colocated with ICASSP 2020)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2005.08118">arXiv:2005.08118</a> <span> [<a href="https://arxiv.org/pdf/2005.08118">pdf</a>, <a href="https://arxiv.org/format/2005.08118">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> That Sounds Familiar: an Analysis of Phonetic Representations Transfer Across Languages </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=%C5%BBelasko%2C+P">Piotr 呕elasko</a>, <a href="/search/eess?searchtype=author&query=Moro-Vel%C3%A1zquez%2C+L">Laureano Moro-Vel谩zquez</a>, <a href="/search/eess?searchtype=author&query=Hasegawa-Johnson%2C+M">Mark Hasegawa-Johnson</a>, <a href="/search/eess?searchtype=author&query=Scharenborg%2C+O">Odette Scharenborg</a>, <a href="/search/eess?searchtype=author&query=Dehak%2C+N">Najim Dehak</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2005.08118v1-abstract-short" style="display: inline;"> Only a handful of the world's languages are abundant with the resources that enable practical applications of speech processing technologies. One of the methods to overcome this problem is to use the resources existing in other languages to train a multilingual automatic speech recognition (ASR) model, which, intuitively, should learn some universal phonetic representations. In this work, we focus… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2005.08118v1-abstract-full').style.display = 'inline'; document.getElementById('2005.08118v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2005.08118v1-abstract-full" style="display: none;"> Only a handful of the world's languages are abundant with the resources that enable practical applications of speech processing technologies. One of the methods to overcome this problem is to use the resources existing in other languages to train a multilingual automatic speech recognition (ASR) model, which, intuitively, should learn some universal phonetic representations. In this work, we focus on gaining a deeper understanding of how general these representations might be, and how individual phones are getting improved in a multilingual setting. To that end, we select a phonetically diverse set of languages, and perform a series of monolingual, multilingual and crosslingual (zero-shot) experiments. The ASR is trained to recognize the International Phonetic Alphabet (IPA) token sequences. We observe significant improvements across all languages in the multilingual setting, and stark degradation in the crosslingual setting, where the model, among other errors, considers Javanese as a tone language. Notably, as little as 10 hours of the target language training data tremendously reduces ASR error rates. Our analysis uncovered that even the phones that are unique to a single language can benefit greatly from adding training data from other languages - an encouraging result for the low-resource speech community. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2005.08118v1-abstract-full').style.display = 'none'; document.getElementById('2005.08118v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 May, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2020. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Submitted to Interspeech 2020. For some reason, the ArXiv Latex engine rendered it in more than 4 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2004.05985">arXiv:2004.05985</a> <span> [<a href="https://arxiv.org/pdf/2004.05985">pdf</a>, <a href="https://arxiv.org/ps/2004.05985">ps</a>, <a href="https://arxiv.org/format/2004.05985">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Punctuation Prediction in Spontaneous Conversations: Can We Mitigate ASR Errors with Retrofitted Word Embeddings? </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Augustyniak%2C+%C5%81">艁ukasz Augustyniak</a>, <a href="/search/eess?searchtype=author&query=Szymanski%2C+P">Piotr Szymanski</a>, <a href="/search/eess?searchtype=author&query=Morzy%2C+M">Miko艂aj Morzy</a>, <a href="/search/eess?searchtype=author&query=Zelasko%2C+P">Piotr Zelasko</a>, <a href="/search/eess?searchtype=author&query=Szymczak%2C+A">Adrian Szymczak</a>, <a href="/search/eess?searchtype=author&query=Mizgajski%2C+J">Jan Mizgajski</a>, <a href="/search/eess?searchtype=author&query=Carmiel%2C+Y">Yishay Carmiel</a>, <a href="/search/eess?searchtype=author&query=Dehak%2C+N">Najim Dehak</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2004.05985v1-abstract-short" style="display: inline;"> Automatic Speech Recognition (ASR) systems introduce word errors, which often confuse punctuation prediction models, turning punctuation restoration into a challenging task. These errors usually take the form of homonyms. We show how retrofitting of the word embeddings on the domain-specific data can mitigate ASR errors. Our main contribution is a method for better alignment of homonym embeddings… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2004.05985v1-abstract-full').style.display = 'inline'; document.getElementById('2004.05985v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2004.05985v1-abstract-full" style="display: none;"> Automatic Speech Recognition (ASR) systems introduce word errors, which often confuse punctuation prediction models, turning punctuation restoration into a challenging task. These errors usually take the form of homonyms. We show how retrofitting of the word embeddings on the domain-specific data can mitigate ASR errors. Our main contribution is a method for better alignment of homonym embeddings and the validation of the presented method on the punctuation prediction task. We record the absolute improvement in punctuation prediction accuracy between 6.2% (for question marks) to 9% (for periods) when compared with the state-of-the-art model. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2004.05985v1-abstract-full').style.display = 'none'; document.getElementById('2004.05985v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 April, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2020. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">submitted to INTERSPEECH'20</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1909.02851">arXiv:1909.02851</a> <span> [<a href="https://arxiv.org/pdf/1909.02851">pdf</a>, <a href="https://arxiv.org/format/1909.02851">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Avaya Conversational Intelligence: A Real-Time System for Spoken Language Understanding in Human-Human Call Center Conversations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Mizgajski%2C+J">Jan Mizgajski</a>, <a href="/search/eess?searchtype=author&query=Szymczak%2C+A">Adrian Szymczak</a>, <a href="/search/eess?searchtype=author&query=G%C5%82owski%2C+R">Robert G艂owski</a>, <a href="/search/eess?searchtype=author&query=Szyma%C5%84ski%2C+P">Piotr Szyma艅ski</a>, <a href="/search/eess?searchtype=author&query=%C5%BBelasko%2C+P">Piotr 呕elasko</a>, <a href="/search/eess?searchtype=author&query=Augustyniak%2C+%C5%81">艁ukasz Augustyniak</a>, <a href="/search/eess?searchtype=author&query=Morzy%2C+M">Miko艂aj Morzy</a>, <a href="/search/eess?searchtype=author&query=Carmiel%2C+Y">Yishay Carmiel</a>, <a href="/search/eess?searchtype=author&query=Hodson%2C+J">Jeff Hodson</a>, <a href="/search/eess?searchtype=author&query=W%C3%B3jciak%2C+%C5%81">艁ukasz W贸jciak</a>, <a href="/search/eess?searchtype=author&query=Smoczyk%2C+D">Daniel Smoczyk</a>, <a href="/search/eess?searchtype=author&query=Wr%C3%B3bel%2C+A">Adam Wr贸bel</a>, <a href="/search/eess?searchtype=author&query=Borowik%2C+B">Bartosz Borowik</a>, <a href="/search/eess?searchtype=author&query=Artajew%2C+A">Adam Artajew</a>, <a href="/search/eess?searchtype=author&query=Baran%2C+M">Marcin Baran</a>, <a href="/search/eess?searchtype=author&query=Kwiatkowski%2C+C">Cezary Kwiatkowski</a>, <a href="/search/eess?searchtype=author&query=%C5%BBy%C5%82a-Hoppe%2C+M">Marzena 呕y艂a-Hoppe</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1909.02851v1-abstract-short" style="display: inline;"> Avaya Conversational Intelligence(ACI) is an end-to-end, cloud-based solution for real-time Spoken Language Understanding for call centers. It combines large vocabulary, real-time speech recognition, transcript refinement, and entity and intent recognition in order to convert live audio into a rich, actionable stream of structured events. These events can be further leveraged with a business rules… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1909.02851v1-abstract-full').style.display = 'inline'; document.getElementById('1909.02851v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1909.02851v1-abstract-full" style="display: none;"> Avaya Conversational Intelligence(ACI) is an end-to-end, cloud-based solution for real-time Spoken Language Understanding for call centers. It combines large vocabulary, real-time speech recognition, transcript refinement, and entity and intent recognition in order to convert live audio into a rich, actionable stream of structured events. These events can be further leveraged with a business rules engine, thus serving as a foundation for real-time supervision and assistance applications. After the ingestion, calls are enriched with unsupervised keyword extraction, abstractive summarization, and business-defined attributes, enabling offline use cases, such as business intelligence, topic mining, full-text search, quality assurance, and agent training. ACI comes with a pretrained, configurable library of hundreds of intents and a robust intent training environment that allows for efficient, cost-effective creation and customization of customer-specific intents. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1909.02851v1-abstract-full').style.display = 'none'; document.getElementById('1909.02851v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 September, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2019. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted for Interspeech 2019</span> </p> </li> </ol> <div class="is-hidden-tablet">  <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>  </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary">  <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div>   <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div>  </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>

CINXE.COM

Search | arXiv e-print repository