Search | arXiv e-print repository
<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–35 of 35 results for author: <span class="mathjax">Fuegen, C</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Fuegen%2C+C">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Fuegen, C"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Fuegen%2C+C&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Fuegen, C"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.01716">arXiv:2404.01716</a> <span> [<a href="https://arxiv.org/pdf/2404.01716">pdf</a>, <a href="https://arxiv.org/format/2404.01716">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Effective internal language model training and fusion for factorized transducer model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Guo%2C+J">Jinxi Guo</a>, <a href="/search/cs?searchtype=author&query=Moritz%2C+N">Niko Moritz</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+Y">Yingyi Ma</a>, <a href="/search/cs?searchtype=author&query=Seide%2C+F">Frank Seide</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+C">Chunyang Wu</a>, <a href="/search/cs?searchtype=author&query=Mahadeokar%2C+J">Jay Mahadeokar</a>, <a href="/search/cs?searchtype=author&query=Kalinli%2C+O">Ozlem Kalinli</a>, <a href="/search/cs?searchtype=author&query=Fuegen%2C+C">Christian Fuegen</a>, <a href="/search/cs?searchtype=author&query=Seltzer%2C+M">Mike Seltzer</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.01716v1-abstract-short" style="display: inline;"> The internal language model (ILM) of the neural transducer has been widely studied. In most prior work, it is mainly used for estimating the ILM score and is subsequently subtracted during inference to facilitate improved integration with external language models. Recently, various of factorized transducer models have been proposed, which explicitly embrace a standalone internal language model for… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.01716v1-abstract-full').style.display = 'inline'; document.getElementById('2404.01716v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.01716v1-abstract-full" style="display: none;"> The internal language model (ILM) of the neural transducer has been widely studied. In most prior work, it is mainly used for estimating the ILM score and is subsequently subtracted during inference to facilitate improved integration with external language models. 
Recently, various of factorized transducer models have been proposed, which explicitly embrace a standalone internal language model for non-blank token prediction. However, even with the adoption of factorized transducer models, limited improvement has been observed compared to shallow fusion. In this paper, we propose a novel ILM training and decoding strategy for factorized transducer models, which effectively combines the blank, acoustic and ILM scores. Our experiments show a 17% relative improvement over the standard decoding method when utilizing a well-trained ILM and the proposed decoding strategy on LibriSpeech datasets. Furthermore, when compared to a strong RNN-T baseline enhanced with external LM fusion, the proposed model yields a 5.5% relative improvement on general-sets and an 8.9% WER reduction for rare words. The proposed model can achieve superior performance without relying on external language models, rendering it highly efficient for production use-cases. To further improve the performance, we propose a novel and memory-efficient ILM-fusion-aware minimum word error rate (MWER) training method which improves ILM integration significantly. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.01716v1-abstract-full').style.display = 'none'; document.getElementById('2404.01716v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to ICASSP 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2401.10411">arXiv:2401.10411</a> <span> [<a href="https://arxiv.org/pdf/2401.10411">pdf</a>, <a href="https://arxiv.org/format/2401.10411">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> AGADIR: Towards Array-Geometry Agnostic Directional Speech Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lin%2C+J">Ju Lin</a>, <a href="/search/cs?searchtype=author&query=Moritz%2C+N">Niko Moritz</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+Y">Yiteng Huang</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+R">Ruiming Xie</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+M">Ming Sun</a>, <a href="/search/cs?searchtype=author&query=Fuegen%2C+C">Christian Fuegen</a>, <a href="/search/cs?searchtype=author&query=Seide%2C+F">Frank Seide</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2401.10411v1-abstract-short" style="display: inline;"> Wearable devices like smart glasses are approaching the compute capability to seamlessly generate real-time closed captions for live conversations. 
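   The score combination described in this abstract can be illustrated with a minimal sketch; the split between blank and label scoring and the interpolation weight below are illustrative assumptions, not the paper's exact decoding rule.

```python
# Illustrative sketch of fusing blank, acoustic (label), and internal-LM scores
# during factorized-transducer decoding. Not the paper's implementation;
# ilm_weight is a hypothetical hyperparameter.
def score_blank(log_p_blank: float) -> float:
    # Blank transitions are scored by the blank predictor alone.
    return log_p_blank

def score_label(log_p_acoustic: float, log_p_ilm: float,
                ilm_weight: float = 0.3) -> float:
    # Non-blank tokens combine the acoustic (label) score with the
    # standalone internal LM score.
    return log_p_acoustic + ilm_weight * log_p_ilm

# Example: comparing a blank hypothesis against a token hypothesis in a beam.
print(score_blank(-0.7), score_label(-1.2, -0.9))
```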
2. arXiv:2401.10411 [pdf, other] (eess.AS, cs.SD)
   AGADIR: Towards Array-Geometry Agnostic Directional Speech Recognition
   Authors: Ju Lin, Niko Moritz, Yiteng Huang, Ruiming Xie, Ming Sun, Christian Fuegen, Frank Seide
   Abstract: Wearable devices like smart glasses are approaching the compute capability to seamlessly generate real-time closed captions for live conversations. We build on our recently introduced directional Automatic Speech Recognition (ASR) for smart glasses that have microphone arrays, which fuses multi-channel ASR with serialized output training, for wearer/conversation-partner disambiguation as well as suppression of cross-talk speech from non-target directions and noise. When ASR work is part of a broader system-development process, one may be faced with changes to microphone geometries as system development progresses. This paper aims to make multi-channel ASR insensitive to limited variations of microphone-array geometry. We show that a model trained on multiple similar geometries is largely agnostic and generalizes well to new geometries, as long as they are not too different. Furthermore, training the model this way improves accuracy for seen geometries by 15 to 28% relative. Lastly, we refine the beamforming by a novel Non-Linearly Constrained Minimum Variance criterion.
   Submitted 18 January, 2024; originally announced January 2024.
   Comments: Accepted to ICASSP 2024
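   For background on the beamforming step, the sketch below shows a textbook MVDR beamformer; the paper's Non-Linearly Constrained Minimum Variance refinement is not reproduced here.

```python
# Standard MVDR beamformer weights for one frequency bin (background sketch only).
import numpy as np

def mvdr_weights(noise_cov: np.ndarray, steering: np.ndarray) -> np.ndarray:
    """w = R^{-1} d / (d^H R^{-1} d): minimize output power subject to a
    distortionless response in the steering direction."""
    r_inv_d = np.linalg.solve(noise_cov, steering)
    return r_inv_d / (steering.conj() @ r_inv_d)

# Toy 4-microphone example with an identity noise covariance.
d = np.ones(4, dtype=complex) / 2.0
w = mvdr_weights(np.eye(4, dtype=complex), d)
print(w)
```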
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to ICASSP 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2311.06753">arXiv:2311.06753</a> <span> [<a href="https://arxiv.org/pdf/2311.06753">pdf</a>, <a href="https://arxiv.org/format/2311.06753">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> AudioChatLlama: Towards General-Purpose Speech Abilities for LLMs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Fathullah%2C+Y">Yassir Fathullah</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+C">Chunyang Wu</a>, <a href="/search/cs?searchtype=author&query=Lakomkin%2C+E">Egor Lakomkin</a>, <a href="/search/cs?searchtype=author&query=Li%2C+K">Ke Li</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+J">Junteng Jia</a>, <a href="/search/cs?searchtype=author&query=Shangguan%2C+Y">Yuan Shangguan</a>, <a href="/search/cs?searchtype=author&query=Mahadeokar%2C+J">Jay Mahadeokar</a>, <a href="/search/cs?searchtype=author&query=Kalinli%2C+O">Ozlem Kalinli</a>, <a href="/search/cs?searchtype=author&query=Fuegen%2C+C">Christian Fuegen</a>, <a href="/search/cs?searchtype=author&query=Seltzer%2C+M">Mike Seltzer</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2311.06753v2-abstract-short" style="display: inline;"> In this work, we extend the instruction-tuned Llama-2 model with end-to-end general-purpose speech processing and reasoning abilities while maintaining the wide range of original LLM capabilities, without using any carefully curated paired data. The resulting end-to-end model, named AudioChatLlama, can utilize audio prompts as a replacement for text and sustain a conversation. Such a model also ha… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.06753v2-abstract-full').style.display = 'inline'; document.getElementById('2311.06753v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2311.06753v2-abstract-full" style="display: none;"> In this work, we extend the instruction-tuned Llama-2 model with end-to-end general-purpose speech processing and reasoning abilities while maintaining the wide range of original LLM capabilities, without using any carefully curated paired data. The resulting end-to-end model, named AudioChatLlama, can utilize audio prompts as a replacement for text and sustain a conversation. Such a model also has extended cross-modal capabilities such as being able to perform spoken question answering (QA), speech translation, and audio summarization amongst many other closed and open-domain tasks. This is unlike prior approaches in speech, in which LLMs are extended to handle audio for a limited number of pre-designated tasks. 
On both synthesized and recorded speech QA test sets, evaluations show that our end-to-end approach is on par with or outperforms cascaded systems (speech recognizer + LLM) in terms of modeling the response to a prompt. Furthermore, unlike cascades, our approach can interchange text and audio modalities and intrinsically utilize prior context in a conversation to provide better results. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.06753v2-abstract-full').style.display = 'none'; document.getElementById('2311.06753v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 12 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2309.10917">arXiv:2309.10917</a> <span> [<a href="https://arxiv.org/pdf/2309.10917">pdf</a>, <a href="https://arxiv.org/format/2309.10917">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> End-to-End Speech Recognition Contextualization with Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lakomkin%2C+E">Egor Lakomkin</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+C">Chunyang Wu</a>, <a href="/search/cs?searchtype=author&query=Fathullah%2C+Y">Yassir Fathullah</a>, <a href="/search/cs?searchtype=author&query=Kalinli%2C+O">Ozlem Kalinli</a>, <a href="/search/cs?searchtype=author&query=Seltzer%2C+M+L">Michael L. Seltzer</a>, <a href="/search/cs?searchtype=author&query=Fuegen%2C+C">Christian Fuegen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2309.10917v1-abstract-short" style="display: inline;"> In recent years, Large Language Models (LLMs) have garnered significant attention from the research community due to their exceptional performance and generalization capabilities. In this paper, we introduce a novel method for contextualizing speech recognition models incorporating LLMs. Our approach casts speech recognition as a mixed-modal language modeling task based on a pretrained LLM. 
We pro… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.10917v1-abstract-full').style.display = 'inline'; document.getElementById('2309.10917v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2309.10917v1-abstract-full" style="display: none;"> In recent years, Large Language Models (LLMs) have garnered significant attention from the research community due to their exceptional performance and generalization capabilities. In this paper, we introduce a novel method for contextualizing speech recognition models incorporating LLMs. Our approach casts speech recognition as a mixed-modal language modeling task based on a pretrained LLM. We provide audio features, along with optional text tokens for context, to train the system to complete transcriptions in a decoder-only fashion. As a result, the system is implicitly incentivized to learn how to leverage unstructured contextual information during training. Our empirical results demonstrate a significant improvement in performance, with a 6% WER reduction when additional textual context is provided. Moreover, we find that our method performs competitively and improve by 7.5% WER overall and 17% WER on rare words against a baseline contextualized RNN-T system that has been trained on more than twenty five times larger speech dataset. Overall, we demonstrate that by only adding a handful number of trainable parameters via adapters, we can unlock contextualized speech recognition capability for the pretrained LLM while keeping the same text-only input functionality. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.10917v1-abstract-full').style.display = 'none'; document.getElementById('2309.10917v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 September, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2023. 
5. arXiv:2307.11795 [pdf, other] (eess.AS, cs.AI, cs.CL, cs.LG)
   Prompting Large Language Models with Speech Recognition Abilities
   Authors: Yassir Fathullah, Chunyang Wu, Egor Lakomkin, Junteng Jia, Yuan Shangguan, Ke Li, Jinxi Guo, Wenhan Xiong, Jay Mahadeokar, Ozlem Kalinli, Christian Fuegen, Mike Seltzer
   Abstract: Large language models have proven themselves highly flexible, able to solve a wide range of generative tasks, such as abstractive summarization and open-ended question answering. In this paper we extend the capabilities of LLMs by directly attaching a small audio encoder, allowing them to perform speech recognition. By directly prepending a sequence of audial embeddings to the text token embeddings, the LLM can be converted to an automatic speech recognition (ASR) system and be used in the exact same manner as its textual counterpart. Experiments on Multilingual LibriSpeech (MLS) show that incorporating a conformer encoder into the open-sourced LLaMA-7B allows it to outperform monolingual baselines by 18% and perform multilingual speech recognition despite LLaMA being trained overwhelmingly on English text. Furthermore, we perform ablation studies to investigate whether the LLM can be completely frozen during training to maintain its original capabilities, scaling up the audio encoder, and increasing the audio encoder striding to generate fewer embeddings. The results from these studies show that multilingual ASR is possible even when the LLM is frozen or when strides of almost 1 second are used in the audio encoder, opening up the possibility for LLMs to operate on long-form audio.
   Submitted 21 July, 2023; originally announced July 2023.
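   The "prepend audio embeddings" idea lends itself to a short sketch. The module below assumes a HuggingFace-style decoder-only LLM (exposing get_input_embeddings and an inputs_embeds argument) and a placeholder audio encoder; it is not the paper's code.

```python
# Illustrative sketch: prepend a sequence of audio-encoder embeddings to the
# text token embeddings of a decoder-only LLM so it can condition on speech.
import torch
import torch.nn as nn

class AudioPrefixLLM(nn.Module):
    def __init__(self, audio_encoder: nn.Module, llm: nn.Module,
                 audio_dim: int, llm_dim: int):
        super().__init__()
        self.audio_encoder = audio_encoder          # e.g. a strided conformer (placeholder)
        self.proj = nn.Linear(audio_dim, llm_dim)   # map audio frames to the LLM width
        self.llm = llm                              # decoder-only LLM, frozen or adapted

    def forward(self, audio_feats, text_token_ids):
        # Audio frames -> embeddings in the LLM's input space.
        audio_emb = self.proj(self.audio_encoder(audio_feats))       # (B, Ta, D)
        text_emb = self.llm.get_input_embeddings()(text_token_ids)   # (B, Tt, D)
        # Prepend the audio embeddings as a prompt before the text tokens.
        inputs = torch.cat([audio_emb, text_emb], dim=1)
        return self.llm(inputs_embeds=inputs)
```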
6. arXiv:2303.17200 [pdf, other] (cs.CV, cs.AI, cs.SD, eess.AS)
   SynthVSR: Scaling Up Visual Speech Recognition With Synthetic Supervision
   Authors: Xubo Liu, Egor Lakomkin, Konstantinos Vougioukas, Pingchuan Ma, Honglie Chen, Ruiming Xie, Morrie Doulaty, Niko Moritz, Jáchym Kolář, Stavros Petridis, Maja Pantic, Christian Fuegen
   Abstract: Recently reported state-of-the-art results in visual speech recognition (VSR) often rely on increasingly large amounts of video data, while the publicly available transcribed video datasets are limited in size. In this paper, for the first time, we study the potential of leveraging synthetic visual data for VSR. Our method, termed SynthVSR, substantially improves the performance of VSR systems with synthetic lip movements. The key idea behind SynthVSR is to leverage a speech-driven lip animation model that generates lip movements conditioned on the input speech. The speech-driven lip animation model is trained on an unlabeled audio-visual dataset and could be further optimized towards a pre-trained VSR model when labeled videos are available. As plenty of transcribed acoustic data and face images are available, we are able to generate large-scale synthetic data using the proposed lip animation model for semi-supervised VSR training. We evaluate the performance of our approach on the largest public VSR benchmark, Lip Reading Sentences 3 (LRS3). SynthVSR achieves a WER of 43.3% with only 30 hours of real labeled data, outperforming off-the-shelf approaches using thousands of hours of video. The WER is further reduced to 27.9% when using all 438 hours of labeled data from LRS3, which is on par with the state-of-the-art self-supervised AV-HuBERT method. Furthermore, when combined with large-scale pseudo-labeled audio-visual data, SynthVSR yields a new state-of-the-art VSR WER of 16.9% using publicly available data only, surpassing the recent state-of-the-art approaches trained with 29 times more non-public machine-transcribed video data (90,000 hours). Finally, we perform extensive ablation studies to understand the effect of each component in our proposed method.
   Submitted 3 April, 2023; v1 submitted 30 March, 2023; originally announced March 2023.
   Comments: IEEE/CVF CVPR 2023
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.02133v2-abstract-full').style.display = 'none'; document.getElementById('2211.02133v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 July, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 3 November, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to Interspeech 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2204.08858">arXiv:2204.08858</a> <span> [<a href="https://arxiv.org/pdf/2204.08858">pdf</a>, <a href="https://arxiv.org/format/2204.08858">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> An Investigation of Monotonic Transducers for Large-Scale Automatic Speech Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Moritz%2C+N">Niko Moritz</a>, <a href="/search/cs?searchtype=author&query=Seide%2C+F">Frank Seide</a>, <a href="/search/cs?searchtype=author&query=Le%2C+D">Duc Le</a>, <a href="/search/cs?searchtype=author&query=Mahadeokar%2C+J">Jay Mahadeokar</a>, <a href="/search/cs?searchtype=author&query=Fuegen%2C+C">Christian Fuegen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2204.08858v2-abstract-short" style="display: inline;"> The two most popular loss functions for streaming end-to-end automatic speech recognition (ASR) are RNN-Transducer (RNN-T) and connectionist temporal classification (CTC). Between these two loss types we can classify the monotonic RNN-T (MonoRNN-T) and the recently proposed CTC-like Transducer (CTC-T). Monotonic transducers have a few advantages. First, RNN-T can suffer from runaway hallucination,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2204.08858v2-abstract-full').style.display = 'inline'; document.getElementById('2204.08858v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2204.08858v2-abstract-full" style="display: none;"> The two most popular loss functions for streaming end-to-end automatic speech recognition (ASR) are RNN-Transducer (RNN-T) and connectionist temporal classification (CTC). Between these two loss types we can classify the monotonic RNN-T (MonoRNN-T) and the recently proposed CTC-like Transducer (CTC-T). Monotonic transducers have a few advantages. First, RNN-T can suffer from runaway hallucination, where a model keeps emitting non-blank symbols without advancing in time. Secondly, monotonic transducers consume exactly one model score per time step and are therefore more compatible with traditional FST-based ASR decoders. However, the MonoRNN-T so far has been found to have worse accuracy than RNN-T. 
8. arXiv:2204.08858 [pdf, other] (eess.AS, cs.SD)
   An Investigation of Monotonic Transducers for Large-Scale Automatic Speech Recognition
   Authors: Niko Moritz, Frank Seide, Duc Le, Jay Mahadeokar, Christian Fuegen
   Abstract: The two most popular loss functions for streaming end-to-end automatic speech recognition (ASR) are RNN-Transducer (RNN-T) and connectionist temporal classification (CTC). Between these two loss types we can classify the monotonic RNN-T (MonoRNN-T) and the recently proposed CTC-like Transducer (CTC-T). Monotonic transducers have a few advantages. First, RNN-T can suffer from runaway hallucination, where a model keeps emitting non-blank symbols without advancing in time. Secondly, monotonic transducers consume exactly one model score per time step and are therefore more compatible with traditional FST-based ASR decoders. However, the MonoRNN-T has so far been found to have worse accuracy than RNN-T. It does not have to be that way: by regularizing the training via joint LAS training or parameter initialization from RNN-T, both MonoRNN-T and CTC-T perform as well as or better than RNN-T. This is demonstrated for LibriSpeech and for a large-scale in-house data set.
   Submitted 21 October, 2022; v1 submitted 19 April, 2022; originally announced April 2022.
   Comments: Accepted to SLT 2022
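   The "exactly one model score per time step" property of monotonic transducers can be sketched with a greedy decoding loop; joint_step below is a hypothetical scorer, not the paper's model.

```python
# Sketch: greedy decoding with a monotonic transducer consumes one score per
# encoder frame (token or blank) and then advances, unlike standard RNN-T,
# which may emit several tokens per frame.
def greedy_monotonic_decode(enc_frames, joint_step, blank_id=0):
    tokens = []
    for frame in enc_frames:               # exactly one score per time step
        token = joint_step(frame, tokens)  # argmax over blank + vocabulary (placeholder)
        if token != blank_id:
            tokens.append(token)           # emit at most one label, then advance
    return tokens
```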
9. arXiv:2111.05948 [pdf, other] (cs.CL, cs.SD, eess.AS)
   Scaling ASR Improves Zero and Few Shot Learning
   Authors: Alex Xiao, Weiyi Zheng, Gil Keren, Duc Le, Frank Zhang, Christian Fuegen, Ozlem Kalinli, Yatharth Saraf, Abdelrahman Mohamed
   Abstract: With 4.5 million hours of English speech from 10 different sources across 120 countries and models of up to 10 billion parameters, we explore the frontiers of scale for automatic speech recognition. We propose data selection techniques to efficiently scale training data to find the most valuable samples in massive datasets. To efficiently scale model sizes, we leverage various optimizations such as sparse transducer loss and model sharding. By training 1-10B parameter universal English ASR models, we push the limits of speech recognition performance across many domains. Furthermore, our models learn powerful speech representations with zero and few-shot capabilities on novel domains and styles of speech, exceeding previous results across multiple in-house and public benchmarks. For speakers with disorders due to brain damage, our best zero-shot and few-shot models achieve 22% and 60% relative improvement on the AphasiaBank test set, respectively, while realizing the best performance on public social media videos. Furthermore, the same universal model reaches equivalent performance with 500x less in-domain data on the SPGISpeech financial-domain dataset.
   Submitted 29 November, 2021; v1 submitted 10 November, 2021; originally announced November 2021.
10. arXiv:2110.07058 [pdf, other] (cs.CV, cs.AI)
   Ego4D: Around the World in 3,000 Hours of Egocentric Video
   Authors: Kristen Grauman, Andrew Westbury, Eugene Byrne, Zachary Chavis, Antonino Furnari, Rohit Girdhar, Jackson Hamburger, Hao Jiang, Miao Liu, Xingyu Liu, Miguel Martin, Tushar Nagarajan, Ilija Radosavovic, Santhosh Kumar Ramakrishnan, Fiona Ryan, Jayant Sharma, Michael Wray, Mengmeng Xu, Eric Zhongcong Xu, Chen Zhao, Siddhant Bansal, Dhruv Batra, Vincent Cartillier, Sean Crane, Tien Do, et al. (60 additional authors not shown)
   Abstract: We introduce Ego4D, a massive-scale egocentric video dataset and benchmark suite. It offers 3,670 hours of daily-life activity video spanning hundreds of scenarios (household, outdoor, workplace, leisure, etc.) captured by 931 unique camera wearers from 74 worldwide locations and 9 different countries. The approach to collection is designed to uphold rigorous privacy and ethics standards, with consenting participants and robust de-identification procedures where relevant. Ego4D dramatically expands the volume of diverse egocentric video footage publicly available to the research community. Portions of the video are accompanied by audio, 3D meshes of the environment, eye gaze, stereo, and/or synchronized videos from multiple egocentric cameras at the same event. Furthermore, we present a host of new benchmark challenges centered around understanding the first-person visual experience in the past (querying an episodic memory), present (analyzing hand-object manipulation, audio-visual conversation, and social interactions), and future (forecasting activities). By publicly sharing this massive annotated dataset and benchmark suite, we aim to push the frontier of first-person perception. Project page: https://ego4d-data.org/
   Submitted 11 March, 2022; v1 submitted 13 October, 2021; originally announced October 2021.
   Comments: To appear in the Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR), 2022. This version updates the baseline result numbers for the Hands and Objects benchmark (appendix).

11. arXiv:2110.05376 [pdf, other] (cs.CL)
   Evaluating User Perception of Speech Recognition System Quality with Semantic Distance Metric
   Authors: Suyoun Kim, Duc Le, Weiyi Zheng, Tarun Singh, Abhinav Arora, Xiaoyu Zhai, Christian Fuegen, Ozlem Kalinli, Michael L. Seltzer
   Abstract: Measuring automatic speech recognition (ASR) system quality is critical for creating user-satisfying voice-driven applications. Word Error Rate (WER) has traditionally been used to evaluate ASR system quality; however, it sometimes correlates poorly with user perception/judgement of transcription quality. This is because WER weighs every word equally and does not consider semantic correctness, which has a higher impact on user perception. In this work, we propose evaluating ASR output hypothesis quality with SemDist, which measures semantic correctness using the distance between the semantic vectors of the reference and the hypothesis extracted from a pre-trained language model. Our experimental results on 71K and 36K user-annotated ASR output quality ratings show that SemDist achieves higher correlation with user perception than WER. We also show that SemDist has higher correlation with downstream Natural Language Understanding (NLU) tasks than WER.
   Submitted 5 July, 2022; v1 submitted 11 October, 2021; originally announced October 2021.
   Comments: INTERSPEECH 2022
We show that such simple linear transfer is already powerful enough to achieve high performance on the downstream tasks. We also provide insights into the attributes of sound event representations that enable such efficient information transfer. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2106.11335v1-abstract-full').style.display = 'none'; document.getElementById('2106.11335v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 June, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted Interspeech 2021</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2104.02232">arXiv:2104.02232</a> <span> [<a href="https://arxiv.org/pdf/2104.02232">pdf</a>, <a href="https://arxiv.org/format/2104.02232">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Flexi-Transducer: Optimizing Latency, Accuracy and Compute for Multi-Domain On-Device Scenarios </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Mahadeokar%2C+J">Jay Mahadeokar</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+Y">Yangyang Shi</a>, <a href="/search/cs?searchtype=author&query=Shangguan%2C+Y">Yuan Shangguan</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+C">Chunyang Wu</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+A">Alex Xiao</a>, <a href="/search/cs?searchtype=author&query=Su%2C+H">Hang Su</a>, <a href="/search/cs?searchtype=author&query=Le%2C+D">Duc Le</a>, <a href="/search/cs?searchtype=author&query=Kalinli%2C+O">Ozlem Kalinli</a>, <a href="/search/cs?searchtype=author&query=Fuegen%2C+C">Christian Fuegen</a>, <a href="/search/cs?searchtype=author&query=Seltzer%2C+M+L">Michael L. Seltzer</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2104.02232v1-abstract-short" style="display: inline;"> Often, the storage and computational constraints of embedded devices demand that a single on-device ASR model serve multiple use-cases / domains. In this paper, we propose a Flexible Transducer (FlexiT) for on-device automatic speech recognition to flexibly deal with multiple use-cases / domains with different accuracy and latency requirements.
Specifically, using a single compact model, FlexiT provid… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2104.02232v1-abstract-full').style.display = 'inline'; document.getElementById('2104.02232v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2104.02232v1-abstract-full" style="display: none;"> Often, the storage and computational constraints of embedded devices demand that a single on-device ASR model serve multiple use-cases / domains. In this paper, we propose a Flexible Transducer (FlexiT) for on-device automatic speech recognition to flexibly deal with multiple use-cases / domains with different accuracy and latency requirements. Specifically, using a single compact model, FlexiT provides a fast response for voice commands, and accurate transcription but with more latency for dictation. In order to achieve flexible and better accuracy and latency trade-offs, the following techniques are used. Firstly, we propose using domain-specific altering of segment size for Emformer encoder that enables FlexiT to achieve flexible decoding. Secondly, we use Alignment Restricted RNNT loss to achieve flexible fine-grained control on token emission latency for different domains. Finally, we add a domain indicator vector as an additional input to the FlexiT model. Using the combination of techniques, we show that a single model can be used to improve WERs and real time factor for dictation scenarios while maintaining optimal latency for voice commands use-cases. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2104.02232v1-abstract-full').style.display = 'none'; document.getElementById('2104.02232v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 April, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2021.
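<p>Two of the ingredients named in the FlexiT abstract above, a domain-dependent encoder segment size and a domain indicator vector appended to the input features, can be pictured with the short sketch below. The domain names, segment sizes, and feature dimensions are illustrative assumptions rather than the paper's configuration, and the Alignment Restricted RNN-T loss is not shown.</p>
<pre><code># Python sketch: one streaming model serving two domains by
# (a) switching the encoder segment size per domain and
# (b) appending a one-hot domain indicator to every input frame.
# Domain names, segment sizes and dimensions are made up for illustration.
import numpy as np

DOMAINS = {"voice_command": 0, "dictation": 1}
SEGMENT_FRAMES = {"voice_command": 8, "dictation": 32}  # smaller segment, lower latency

def add_domain_indicator(features, domain):
    one_hot = np.zeros((features.shape[0], len(DOMAINS)))
    one_hot[:, DOMAINS[domain]] = 1.0
    return np.concatenate([features, one_hot], axis=1)

def stream_segments(features, domain):
    feats = add_domain_indicator(features, domain)
    step = SEGMENT_FRAMES[domain]
    for start in range(0, feats.shape[0], step):
        yield feats[start:start + step]  # each segment would go to the encoder

audio_feats = np.random.randn(100, 80)   # 100 frames of 80-dim features
segments = list(stream_segments(audio_feats, "voice_command"))
print(len(segments), segments[0].shape)  # 13 segments, first one is (8, 82)
</code></pre>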
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Submitted to Interspeech 2021 (under review)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2104.02207">arXiv:2104.02207</a> <span> [<a href="https://arxiv.org/pdf/2104.02207">pdf</a>, <a href="https://arxiv.org/format/2104.02207">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Dissecting User-Perceived Latency of On-Device E2E Speech Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shangguan%2C+Y">Yuan Shangguan</a>, <a href="/search/cs?searchtype=author&query=Prabhavalkar%2C+R">Rohit Prabhavalkar</a>, <a href="/search/cs?searchtype=author&query=Su%2C+H">Hang Su</a>, <a href="/search/cs?searchtype=author&query=Mahadeokar%2C+J">Jay Mahadeokar</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+Y">Yangyang Shi</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+J">Jiatong Zhou</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+C">Chunyang Wu</a>, <a href="/search/cs?searchtype=author&query=Le%2C+D">Duc Le</a>, <a href="/search/cs?searchtype=author&query=Kalinli%2C+O">Ozlem Kalinli</a>, <a href="/search/cs?searchtype=author&query=Fuegen%2C+C">Christian Fuegen</a>, <a href="/search/cs?searchtype=author&query=Seltzer%2C+M+L">Michael L. Seltzer</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2104.02207v3-abstract-short" style="display: inline;"> As speech-enabled devices such as smartphones and smart speakers become increasingly ubiquitous, there is growing interest in building automatic speech recognition (ASR) systems that can run directly on-device; end-to-end (E2E) speech recognition models such as recurrent neural network transducers and their variants have recently emerged as prime candidates for this task. Apart from being accurate… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2104.02207v3-abstract-full').style.display = 'inline'; document.getElementById('2104.02207v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2104.02207v3-abstract-full" style="display: none;"> As speech-enabled devices such as smartphones and smart speakers become increasingly ubiquitous, there is growing interest in building automatic speech recognition (ASR) systems that can run directly on-device; end-to-end (E2E) speech recognition models such as recurrent neural network transducers and their variants have recently emerged as prime candidates for this task. Apart from being accurate and compact, such systems need to decode speech with low user-perceived latency (UPL), producing words as soon as they are spoken. This work examines the impact of various techniques - model architectures, training criteria, decoding hyperparameters, and endpointer parameters - on UPL. 
Our analyses suggest that measures of model size (parameters, input chunk sizes), or measures of computation (e.g., FLOPS, RTF) that reflect the model's ability to process input frames are not always strongly correlated with observed UPL. Thus, conventional algorithmic latency measurements might be inadequate in accurately capturing latency observed when models are deployed on embedded devices. Instead, we find that factors affecting token emission latency, and endpointing behavior have a larger impact on UPL. We achieve the best trade-off between latency and word error rate when performing ASR jointly with endpointing, while utilizing the recently proposed alignment regularization mechanism. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2104.02207v3-abstract-full').style.display = 'none'; document.getElementById('2104.02207v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 August, 2021; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 5 April, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Proc. of Interspeech 2021</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2104.02194">arXiv:2104.02194</a> <span> [<a href="https://arxiv.org/pdf/2104.02194">pdf</a>, <a href="https://arxiv.org/format/2104.02194">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Contextualized Streaming End-to-End Speech Recognition with Trie-Based Deep Biasing and Shallow Fusion </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Le%2C+D">Duc Le</a>, <a href="/search/cs?searchtype=author&query=Jain%2C+M">Mahaveer Jain</a>, <a href="/search/cs?searchtype=author&query=Keren%2C+G">Gil Keren</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+S">Suyoun Kim</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+Y">Yangyang Shi</a>, <a href="/search/cs?searchtype=author&query=Mahadeokar%2C+J">Jay Mahadeokar</a>, <a href="/search/cs?searchtype=author&query=Chan%2C+J">Julian Chan</a>, <a href="/search/cs?searchtype=author&query=Shangguan%2C+Y">Yuan Shangguan</a>, <a href="/search/cs?searchtype=author&query=Fuegen%2C+C">Christian Fuegen</a>, <a href="/search/cs?searchtype=author&query=Kalinli%2C+O">Ozlem Kalinli</a>, <a href="/search/cs?searchtype=author&query=Saraf%2C+Y">Yatharth Saraf</a>, <a href="/search/cs?searchtype=author&query=Seltzer%2C+M+L">Michael L. Seltzer</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2104.02194v2-abstract-short" style="display: inline;"> How to leverage dynamic contextual information in end-to-end speech recognition has remained an active research area. 
Previous solutions to this problem were either designed for specialized use cases that did not generalize well to open-domain scenarios, did not scale to large biasing lists, or underperformed on rare long-tail words. We address these limitations by proposing a novel solution that… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2104.02194v2-abstract-full').style.display = 'inline'; document.getElementById('2104.02194v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2104.02194v2-abstract-full" style="display: none;"> How to leverage dynamic contextual information in end-to-end speech recognition has remained an active research area. Previous solutions to this problem were either designed for specialized use cases that did not generalize well to open-domain scenarios, did not scale to large biasing lists, or underperformed on rare long-tail words. We address these limitations by proposing a novel solution that combines shallow fusion, trie-based deep biasing, and neural network language model contextualization. These techniques result in significant 19.5% relative Word Error Rate improvement over existing contextual biasing approaches and 5.4%-9.3% improvement compared to a strong hybrid baseline on both open-domain and constrained contextualization tasks, where the targets consist of mostly rare long-tail words. Our final system remains lightweight and modular, allowing for quick modification without model re-training. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2104.02194v2-abstract-full').style.display = 'none'; document.getElementById('2104.02194v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 June, 2021; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 5 April, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2021. 
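<p>The shallow-fusion side of the contextual biasing recipe above (arXiv:2104.02194) can be sketched as a prefix trie over biasing phrases plus a per-token score bonus whenever a hypothesis keeps extending a phrase in the trie. The word-level tokens and the bonus value below are illustrative assumptions; the paper additionally uses trie-based deep biasing inside the model and a neural LM, which are not shown.</p>
<pre><code># Python sketch of trie-based contextual biasing with shallow fusion:
# during rescoring, every token that starts or extends a phrase from the
# biasing list adds a bonus to the hypothesis score. Word-level tokens
# and the bonus value are illustrative choices.
BONUS = 2.0  # log-score bonus per biased token (illustrative)

def build_trie(phrases):
    root = {}
    for phrase in phrases:
        node = root
        for token in phrase.lower().split():
            node = node.setdefault(token, {})
    return root

def rescore(base_score, tokens, trie):
    """Add the biasing bonus to one hypothesis's score."""
    score, node = base_score, trie
    for token in tokens:
        if token not in node:
            node = trie              # phrase broken: fall back to the root
        if token in node:
            score += BONUS           # token starts or extends a biasing phrase
            node = node[token]
        else:
            node = trie
    return score

trie = build_trie(["Jay Mahadeokar", "Duc Le"])
print(rescore(-10.0, "call jay mahadeokar now".split(), trie))  # -10.0 + 2 * BONUS
</code></pre>
<p>In a real decoder the trie state would be carried along with each beam hypothesis instead of being recomputed from the full token history as it is in this sketch.</p>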
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted for presentation at INTERSPEECH 2021</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2104.02176">arXiv:2104.02176</a> <span> [<a href="https://arxiv.org/pdf/2104.02176">pdf</a>, <a href="https://arxiv.org/format/2104.02176">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Dynamic Encoder Transducer: A Flexible Solution For Trading Off Accuracy For Latency </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shi%2C+Y">Yangyang Shi</a>, <a href="/search/cs?searchtype=author&query=Nagaraja%2C+V">Varun Nagaraja</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+C">Chunyang Wu</a>, <a href="/search/cs?searchtype=author&query=Mahadeokar%2C+J">Jay Mahadeokar</a>, <a href="/search/cs?searchtype=author&query=Le%2C+D">Duc Le</a>, <a href="/search/cs?searchtype=author&query=Prabhavalkar%2C+R">Rohit Prabhavalkar</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+A">Alex Xiao</a>, <a href="/search/cs?searchtype=author&query=Yeh%2C+C">Ching-Feng Yeh</a>, <a href="/search/cs?searchtype=author&query=Chan%2C+J">Julian Chan</a>, <a href="/search/cs?searchtype=author&query=Fuegen%2C+C">Christian Fuegen</a>, <a href="/search/cs?searchtype=author&query=Kalinli%2C+O">Ozlem Kalinli</a>, <a href="/search/cs?searchtype=author&query=Seltzer%2C+M+L">Michael L. Seltzer</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2104.02176v1-abstract-short" style="display: inline;"> We propose a dynamic encoder transducer (DET) for on-device speech recognition. One DET model scales to multiple devices with different computation capacities without retraining or finetuning. To trading off accuracy and latency, DET assigns different encoders to decode different parts of an utterance. We apply and compare the layer dropout and the collaborative learning for DET training. The laye… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2104.02176v1-abstract-full').style.display = 'inline'; document.getElementById('2104.02176v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2104.02176v1-abstract-full" style="display: none;"> We propose a dynamic encoder transducer (DET) for on-device speech recognition. One DET model scales to multiple devices with different computation capacities without retraining or finetuning. To trading off accuracy and latency, DET assigns different encoders to decode different parts of an utterance. We apply and compare the layer dropout and the collaborative learning for DET training. The layer dropout method that randomly drops out encoder layers in the training phase, can do on-demand layer dropout in decoding. Collaborative learning jointly trains multiple encoders with different depths in one single model. Experiment results on Librispeech and in-house data show that DET provides a flexible accuracy and latency trade-off. 
Results on Librispeech show that the full-size encoder in DET relatively reduces the word error rate of the same size baseline by over 8%. The lightweight encoder in DET trained with collaborative learning reduces the model size by 25% but still gets similar WER as the full-size baseline. DET gets similar accuracy as a baseline model with better latency on a large in-house data set by assigning a lightweight encoder for the beginning part of one utterance and a full-size encoder for the rest. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2104.02176v1-abstract-full').style.display = 'none'; document.getElementById('2104.02176v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 April, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages, 2 figures, submitted Interspeech 2021</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2104.02138">arXiv:2104.02138</a> <span> [<a href="https://arxiv.org/pdf/2104.02138">pdf</a>, <a href="https://arxiv.org/format/2104.02138">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Semantic Distance: A New Metric for ASR Performance Analysis Towards Spoken Language Understanding </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kim%2C+S">Suyoun Kim</a>, <a href="/search/cs?searchtype=author&query=Arora%2C+A">Abhinav Arora</a>, <a href="/search/cs?searchtype=author&query=Le%2C+D">Duc Le</a>, <a href="/search/cs?searchtype=author&query=Yeh%2C+C">Ching-Feng Yeh</a>, <a href="/search/cs?searchtype=author&query=Fuegen%2C+C">Christian Fuegen</a>, <a href="/search/cs?searchtype=author&query=Kalinli%2C+O">Ozlem Kalinli</a>, <a href="/search/cs?searchtype=author&query=Seltzer%2C+M+L">Michael L. Seltzer</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2104.02138v1-abstract-short" style="display: inline;"> Word Error Rate (WER) has been the predominant metric used to evaluate the performance of automatic speech recognition (ASR) systems. However, WER is sometimes not a good indicator for downstream Natural Language Understanding (NLU) tasks, such as intent recognition, slot filling, and semantic parsing in task-oriented dialog systems. This is because WER takes into consideration only literal correc… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2104.02138v1-abstract-full').style.display = 'inline'; document.getElementById('2104.02138v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2104.02138v1-abstract-full" style="display: none;"> Word Error Rate (WER) has been the predominant metric used to evaluate the performance of automatic speech recognition (ASR) systems. 
However, WER is sometimes not a good indicator for downstream Natural Language Understanding (NLU) tasks, such as intent recognition, slot filling, and semantic parsing in task-oriented dialog systems. This is because WER takes into consideration only literal correctness instead of semantic correctness, the latter of which is typically more important for these downstream tasks. In this study, we propose a novel Semantic Distance (SemDist) measure as an alternative evaluation metric for ASR systems to address this issue. We define SemDist as the distance between a reference and hypothesis pair in a sentence-level embedding space. To represent the reference and hypothesis as a sentence embedding, we exploit RoBERTa, a state-of-the-art pre-trained deep contextualized language model based on the transformer architecture. We demonstrate the effectiveness of our proposed metric on various downstream tasks, including intent recognition, semantic parsing, and named entity recognition. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2104.02138v1-abstract-full').style.display = 'none'; document.getElementById('2104.02138v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 April, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">submitted to Interspeech 2021</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2103.05149">arXiv:2103.05149</a> <span> [<a href="https://arxiv.org/pdf/2103.05149">pdf</a>, <a href="https://arxiv.org/ps/2103.05149">ps</a>, <a href="https://arxiv.org/format/2103.05149">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Contrastive Semi-supervised Learning for ASR </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xiao%2C+A">Alex Xiao</a>, <a href="/search/cs?searchtype=author&query=Fuegen%2C+C">Christian Fuegen</a>, <a href="/search/cs?searchtype=author&query=Mohamed%2C+A">Abdelrahman Mohamed</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2103.05149v1-abstract-short" style="display: inline;"> Pseudo-labeling is the most adopted method for pre-training automatic speech recognition (ASR) models. However, its performance suffers from the supervised teacher model's degrading quality in low-resource setups and under domain transfer. 
Inspired by the successes of contrastive representation learning for computer vision and speech applications, and more recently for supervised learning of visua… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2103.05149v1-abstract-full').style.display = 'inline'; document.getElementById('2103.05149v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2103.05149v1-abstract-full" style="display: none;"> Pseudo-labeling is the most adopted method for pre-training automatic speech recognition (ASR) models. However, its performance suffers from the supervised teacher model's degrading quality in low-resource setups and under domain transfer. Inspired by the successes of contrastive representation learning for computer vision and speech applications, and more recently for supervised learning of visual objects, we propose Contrastive Semi-supervised Learning (CSL). CSL eschews directly predicting teacher-generated pseudo-labels in favor of utilizing them to select positive and negative examples. In the challenging task of transcribing public social media videos, using CSL reduces the WER by 8% compared to the standard Cross-Entropy pseudo-labeling (CE-PL) when 10hr of supervised data is used to annotate 75,000hr of videos. The WER reduction jumps to 19% under the ultra low-resource condition of using 1hr labels for teacher supervision. CSL generalizes much better in out-of-domain conditions, showing up to 17% WER reduction compared to the best CE-PL pre-trained model. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2103.05149v1-abstract-full').style.display = 'none'; document.getElementById('2103.05149v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 March, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2021. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2102.11531">arXiv:2102.11531</a> <span> [<a href="https://arxiv.org/pdf/2102.11531">pdf</a>, <a href="https://arxiv.org/format/2102.11531">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Memory-efficient Speech Recognition on Smart Devices </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Venkatesh%2C+G">Ganesh Venkatesh</a>, <a href="/search/cs?searchtype=author&query=Valliappan%2C+A">Alagappan Valliappan</a>, <a href="/search/cs?searchtype=author&query=Mahadeokar%2C+J">Jay Mahadeokar</a>, <a href="/search/cs?searchtype=author&query=Shangguan%2C+Y">Yuan Shangguan</a>, <a href="/search/cs?searchtype=author&query=Fuegen%2C+C">Christian Fuegen</a>, <a href="/search/cs?searchtype=author&query=Seltzer%2C+M+L">Michael L. 
Seltzer</a>, <a href="/search/cs?searchtype=author&query=Chandra%2C+V">Vikas Chandra</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2102.11531v1-abstract-short" style="display: inline;"> Recurrent transducer models have emerged as a promising solution for speech recognition on the current and next generation smart devices. The transducer models provide competitive accuracy within a reasonable memory footprint alleviating the memory capacity constraints in these devices. However, these models access parameters from off-chip memory for every input time step which adversely effects d… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2102.11531v1-abstract-full').style.display = 'inline'; document.getElementById('2102.11531v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2102.11531v1-abstract-full" style="display: none;"> Recurrent transducer models have emerged as a promising solution for speech recognition on the current and next generation smart devices. The transducer models provide competitive accuracy within a reasonable memory footprint alleviating the memory capacity constraints in these devices. However, these models access parameters from off-chip memory for every input time step which adversely effects device battery life and limits their usability on low-power devices. We address transducer model's memory access concerns by optimizing their model architecture and designing novel recurrent cell designs. We demonstrate that i) model's energy cost is dominated by accessing model weights from off-chip memory, ii) transducer model architecture is pivotal in determining the number of accesses to off-chip memory and just model size is not a good proxy, iii) our transducer model optimizations and novel recurrent cell reduces off-chip memory accesses by 4.5x and model size by 2x with minimal accuracy impact. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2102.11531v1-abstract-full').style.display = 'none'; document.getElementById('2102.11531v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 February, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2021. 
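<p>A back-of-envelope version of the accounting behind findings (i) and (ii) in the abstract above: per-second off-chip weight traffic depends on how often each block's parameters are re-read, not just on the total parameter count. All numbers below (byte width, frame rate, layer mix) are illustrative assumptions, not figures from the paper.</p>
<pre><code># Python back-of-envelope: off-chip weight traffic of a streaming transducer.
# Byte width, frame rate and the layer mix are illustrative assumptions.
BYTES_PER_WEIGHT = 1      # e.g. int8 weights
FRAMES_PER_SECOND = 25    # encoder frames after subsampling

layers = [
    # (name, parameter count, times the block runs per encoder frame)
    ("recurrent_encoder", 30e6, 1.0),  # touches its weights on every frame
    ("joiner", 5e6, 1.0),
    ("predictor", 10e6, 0.2),          # runs only when a token is emitted
]

size_mb = sum(p for _, p, _ in layers) * BYTES_PER_WEIGHT / 1e6
traffic_mb_s = sum(p * rate for _, p, rate in layers) * BYTES_PER_WEIGHT * FRAMES_PER_SECOND / 1e6
print(f"model size: {size_mb:.0f} MB, weight reads: {traffic_mb_s:.0f} MB/s")
# Two models of identical size can differ widely in MB/s if their blocks
# run at different rates, which is why size alone is a poor proxy.
</code></pre>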
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> ICASSP 2021 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2011.07754">arXiv:2011.07754</a> <span> [<a href="https://arxiv.org/pdf/2011.07754">pdf</a>, <a href="https://arxiv.org/format/2011.07754">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Deep Shallow Fusion for RNN-T Personalization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Le%2C+D">Duc Le</a>, <a href="/search/cs?searchtype=author&query=Keren%2C+G">Gil Keren</a>, <a href="/search/cs?searchtype=author&query=Chan%2C+J">Julian Chan</a>, <a href="/search/cs?searchtype=author&query=Mahadeokar%2C+J">Jay Mahadeokar</a>, <a href="/search/cs?searchtype=author&query=Fuegen%2C+C">Christian Fuegen</a>, <a href="/search/cs?searchtype=author&query=Seltzer%2C+M+L">Michael L. Seltzer</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2011.07754v1-abstract-short" style="display: inline;"> End-to-end models in general, and Recurrent Neural Network Transducer (RNN-T) in particular, have gained significant traction in the automatic speech recognition community in the last few years due to their simplicity, compactness, and excellent performance on generic transcription tasks. However, these models are more challenging to personalize compared to traditional hybrid systems due to the la… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2011.07754v1-abstract-full').style.display = 'inline'; document.getElementById('2011.07754v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2011.07754v1-abstract-full" style="display: none;"> End-to-end models in general, and Recurrent Neural Network Transducer (RNN-T) in particular, have gained significant traction in the automatic speech recognition community in the last few years due to their simplicity, compactness, and excellent performance on generic transcription tasks. However, these models are more challenging to personalize compared to traditional hybrid systems due to the lack of external language models and difficulties in recognizing rare long-tail words, specifically entity names. In this work, we present novel techniques to improve RNN-T's ability to model rare WordPieces, infuse extra information into the encoder, enable the use of alternative graphemic pronunciations, and perform deep fusion with personalized language models for more robust biasing. We show that these combined techniques result in 15.4%-34.5% relative Word Error Rate improvement compared to a strong RNN-T baseline which uses shallow fusion and text-to-speech augmentation. Our work helps push the boundary of RNN-T personalization and close the gap with hybrid systems on use cases where biasing and entity recognition are crucial. 
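<p>Shallow fusion with a personalized language model, one ingredient of the RNN-T personalization work above, amounts to adding a weighted external LM log-probability to the model's token score during beam search. The fusion weight and the toy bigram LM below are illustrative assumptions.</p>
<pre><code># Python sketch of shallow fusion with a personalized LM: the token score
# is the ASR log-probability plus a weighted log-probability from an
# external language model built from the user's own entities.
# The 0.3 weight and the toy bigram LM are illustrative assumptions.
import math

LM_WEIGHT = 0.3
personal_lm = {          # toy bigram model P(word | previous word)
    ("call", "thilo"): 0.4,
    ("call", "ozlem"): 0.4,
}

def fused_score(asr_logprob, prev_word, word):
    lm_prob = personal_lm.get((prev_word, word), 1e-4)  # floor for unseen pairs
    return asr_logprob + LM_WEIGHT * math.log(lm_prob)

# An acoustically confusable pair: fusion favors the contact name that the
# personalized LM actually knows about, despite a slightly worse ASR score.
print(fused_score(-1.2, "call", "thilo"))
print(fused_score(-1.0, "call", "tilo"))
</code></pre>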
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2011.07754v1-abstract-full').style.display = 'none'; document.getElementById('2011.07754v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 November, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2020. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">To appear at SLT 2021</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2011.03072">arXiv:2011.03072</a> <span> [<a href="https://arxiv.org/pdf/2011.03072">pdf</a>, <a href="https://arxiv.org/format/2011.03072">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Alignment Restricted Streaming Recurrent Neural Network Transducer </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Mahadeokar%2C+J">Jay Mahadeokar</a>, <a href="/search/cs?searchtype=author&query=Shangguan%2C+Y">Yuan Shangguan</a>, <a href="/search/cs?searchtype=author&query=Le%2C+D">Duc Le</a>, <a href="/search/cs?searchtype=author&query=Keren%2C+G">Gil Keren</a>, <a href="/search/cs?searchtype=author&query=Su%2C+H">Hang Su</a>, <a href="/search/cs?searchtype=author&query=Le%2C+T">Thong Le</a>, <a href="/search/cs?searchtype=author&query=Yeh%2C+C">Ching-Feng Yeh</a>, <a href="/search/cs?searchtype=author&query=Fuegen%2C+C">Christian Fuegen</a>, <a href="/search/cs?searchtype=author&query=Seltzer%2C+M+L">Michael L. Seltzer</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2011.03072v1-abstract-short" style="display: inline;"> There is a growing interest in the speech community in developing Recurrent Neural Network Transducer (RNN-T) models for automatic speech recognition (ASR) applications. RNN-T is trained with a loss function that does not enforce temporal alignment of the training transcripts and audio. As a result, RNN-T models built with uni-directional long short term memory (LSTM) encoders tend to wait for lon… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2011.03072v1-abstract-full').style.display = 'inline'; document.getElementById('2011.03072v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2011.03072v1-abstract-full" style="display: none;"> There is a growing interest in the speech community in developing Recurrent Neural Network Transducer (RNN-T) models for automatic speech recognition (ASR) applications. RNN-T is trained with a loss function that does not enforce temporal alignment of the training transcripts and audio. 
As a result, RNN-T models built with uni-directional long short term memory (LSTM) encoders tend to wait for longer spans of input audio, before streaming already decoded ASR tokens. In this work, we propose a modification to the RNN-T loss function and develop Alignment Restricted RNN-T (Ar-RNN-T) models, which utilize audio-text alignment information to guide the loss computation. We compare the proposed method with existing works, such as monotonic RNN-T, on LibriSpeech and in-house datasets. We show that the Ar-RNN-T loss provides a refined control to navigate the trade-offs between the token emission delays and the Word Error Rate (WER). The Ar-RNN-T models also improve downstream applications such as the ASR End-pointing by guaranteeing token emissions within any given range of latency. Moreover, the Ar-RNN-T loss allows for bigger batch sizes and 4 times higher throughput for our LSTM model architecture, enabling faster training and convergence on GPUs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2011.03072v1-abstract-full').style.display = 'none'; document.getElementById('2011.03072v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 November, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2020. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted for presentation at IEEE Spoken Language Technology Workshop (SLT) 2021</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2010.13878">arXiv:2010.13878</a> <span> [<a href="https://arxiv.org/pdf/2010.13878">pdf</a>, <a href="https://arxiv.org/ps/2010.13878">ps</a>, <a href="https://arxiv.org/format/2010.13878">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Improved Neural Language Model Fusion for Streaming Recurrent Neural Network Transducer </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kim%2C+S">Suyoun Kim</a>, <a href="/search/cs?searchtype=author&query=Shangguan%2C+Y">Yuan Shangguan</a>, <a href="/search/cs?searchtype=author&query=Mahadeokar%2C+J">Jay Mahadeokar</a>, <a href="/search/cs?searchtype=author&query=Bruguier%2C+A">Antoine Bruguier</a>, <a href="/search/cs?searchtype=author&query=Fuegen%2C+C">Christian Fuegen</a>, <a href="/search/cs?searchtype=author&query=Seltzer%2C+M+L">Michael L. Seltzer</a>, <a href="/search/cs?searchtype=author&query=Le%2C+D">Duc Le</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2010.13878v1-abstract-short" style="display: inline;"> Recurrent Neural Network Transducer (RNN-T), like most end-to-end speech recognition model architectures, has an implicit neural network language model (NNLM) and cannot easily leverage unpaired text data during training. Previous work has proposed various fusion methods to incorporate external NNLMs into end-to-end ASR to address this weakness. 
In this paper, we propose extensions to these techni… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2010.13878v1-abstract-full').style.display = 'inline'; document.getElementById('2010.13878v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2010.13878v1-abstract-full" style="display: none;"> Recurrent Neural Network Transducer (RNN-T), like most end-to-end speech recognition model architectures, has an implicit neural network language model (NNLM) and cannot easily leverage unpaired text data during training. Previous work has proposed various fusion methods to incorporate external NNLMs into end-to-end ASR to address this weakness. In this paper, we propose extensions to these techniques that allow RNN-T to exploit external NNLMs during both training and inference time, resulting in 13-18% relative Word Error Rate improvement on Librispeech compared to strong baselines. Furthermore, our methods do not incur extra algorithmic latency and allow for flexible plug-and-play of different NNLMs without re-training. We also share in-depth analysis to better understand the benefits of the different NNLM fusion methods. Our work provides a reliable technique for leveraging unpaired text data to significantly improve RNN-T while keeping the system streamable, flexible, and lightweight. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2010.13878v1-abstract-full').style.display = 'none'; document.getElementById('2010.13878v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 October, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2020. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">submitted to ICASSP 2021</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2005.09137">arXiv:2005.09137</a> <span> [<a href="https://arxiv.org/pdf/2005.09137">pdf</a>, <a href="https://arxiv.org/format/2005.09137">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Weak-Attention Suppression For Transformer Based Speech Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shi%2C+Y">Yangyang Shi</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yongqiang Wang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+C">Chunyang Wu</a>, <a href="/search/cs?searchtype=author&query=Fuegen%2C+C">Christian Fuegen</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+F">Frank Zhang</a>, <a href="/search/cs?searchtype=author&query=Le%2C+D">Duc Le</a>, <a href="/search/cs?searchtype=author&query=Yeh%2C+C">Ching-Feng Yeh</a>, <a href="/search/cs?searchtype=author&query=Seltzer%2C+M+L">Michael L. 
Seltzer</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2005.09137v1-abstract-short" style="display: inline;"> Transformers, originally proposed for natural language processing (NLP) tasks, have recently achieved great success in automatic speech recognition (ASR). However, adjacent acoustic units (i.e., frames) are highly correlated, and long-distance dependencies between them are weak, unlike text units. It suggests that ASR will likely benefit from sparse and localized attention. In this paper, we propo… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2005.09137v1-abstract-full').style.display = 'inline'; document.getElementById('2005.09137v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2005.09137v1-abstract-full" style="display: none;"> Transformers, originally proposed for natural language processing (NLP) tasks, have recently achieved great success in automatic speech recognition (ASR). However, adjacent acoustic units (i.e., frames) are highly correlated, and long-distance dependencies between them are weak, unlike text units. It suggests that ASR will likely benefit from sparse and localized attention. In this paper, we propose Weak-Attention Suppression (WAS), a method that dynamically induces sparsity in attention probabilities. We demonstrate that WAS leads to consistent Word Error Rate (WER) improvement over strong transformer baselines. On the widely used LibriSpeech benchmark, our proposed method reduced WER by 10% on test-clean and 5% on test-other for streamable transformers, resulting in a new state-of-the-art among streaming models. Further analysis shows that WAS learns to suppress attention of non-critical and redundant continuous acoustic frames, and is more likely to suppress past frames rather than future ones. It indicates the importance of lookahead in attention-based ASR models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2005.09137v1-abstract-full').style.display = 'none'; document.getElementById('2005.09137v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 May, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2020.
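<p>The core of Weak-Attention Suppression as described above is to zero out attention probabilities that fall below a dynamically chosen, per-query threshold and renormalize the rest. The sketch below uses a mean-minus-gamma-times-std threshold; treat the exact threshold form and the gamma value as illustrative assumptions rather than the paper's precise recipe.</p>
<pre><code># Python sketch of weak-attention suppression: attention probabilities
# below a per-query dynamic threshold are zeroed and the rest renormalized,
# which induces sparsity. gamma and the threshold form are illustrative.
import numpy as np

def suppress_weak_attention(attn, gamma=0.5):
    """attn: array of shape (num_queries, num_keys), rows sum to 1."""
    threshold = attn.mean(axis=-1, keepdims=True) - gamma * attn.std(axis=-1, keepdims=True)
    kept = np.where(attn >= threshold, attn, 0.0)   # the row maximum always survives
    return kept / kept.sum(axis=-1, keepdims=True)

rng = np.random.default_rng(0)
scores = rng.normal(size=(2, 10))
attn = np.exp(scores) / np.exp(scores).sum(axis=-1, keepdims=True)
sparse = suppress_weak_attention(attn)
print(int((attn > 0).sum()), int((sparse > 0).sum()))  # typically fewer nonzeros after WAS
</code></pre>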
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">submitted to interspeech 2020</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2005.07850">arXiv:2005.07850</a> <span> [<a href="https://arxiv.org/pdf/2005.07850">pdf</a>, <a href="https://arxiv.org/ps/2005.07850">ps</a>, <a href="https://arxiv.org/format/2005.07850">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Large scale weakly and semi-supervised learning for low-resource video ASR </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Singh%2C+K">Kritika Singh</a>, <a href="/search/cs?searchtype=author&query=Manohar%2C+V">Vimal Manohar</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+A">Alex Xiao</a>, <a href="/search/cs?searchtype=author&query=Edunov%2C+S">Sergey Edunov</a>, <a href="/search/cs?searchtype=author&query=Girshick%2C+R">Ross Girshick</a>, <a href="/search/cs?searchtype=author&query=Liptchinsky%2C+V">Vitaliy Liptchinsky</a>, <a href="/search/cs?searchtype=author&query=Fuegen%2C+C">Christian Fuegen</a>, <a href="/search/cs?searchtype=author&query=Saraf%2C+Y">Yatharth Saraf</a>, <a href="/search/cs?searchtype=author&query=Zweig%2C+G">Geoffrey Zweig</a>, <a href="/search/cs?searchtype=author&query=Mohamed%2C+A">Abdelrahman Mohamed</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2005.07850v2-abstract-short" style="display: inline;"> Many semi- and weakly-supervised approaches have been investigated for overcoming the labeling cost of building high quality speech recognition systems. On the challenging task of transcribing social media videos in low-resource conditions, we conduct a large scale systematic comparison between two self-labeling methods on one hand, and weakly-supervised pretraining using contextual metadata on th… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2005.07850v2-abstract-full').style.display = 'inline'; document.getElementById('2005.07850v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2005.07850v2-abstract-full" style="display: none;"> Many semi- and weakly-supervised approaches have been investigated for overcoming the labeling cost of building high quality speech recognition systems. On the challenging task of transcribing social media videos in low-resource conditions, we conduct a large scale systematic comparison between two self-labeling methods on one hand, and weakly-supervised pretraining using contextual metadata on the other. We investigate distillation methods at the frame level and the sequence level for hybrid, encoder-only CTC-based, and encoder-decoder speech recognition systems on Dutch and Romanian languages using 27,000 and 58,000 hours of unlabeled audio respectively. 
Although all approaches improved upon their respective baseline WERs by more than 8%, sequence-level distillation for encoder-decoder models provided the largest relative WER reduction of 20% compared to the strongest data-augmented supervised baseline. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2005.07850v2-abstract-full').style.display = 'none'; document.getElementById('2005.07850v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 August, 2020; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 15 May, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2020. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2002.06758">arXiv:2002.06758</a> <span> [<a href="https://arxiv.org/pdf/2002.06758">pdf</a>, <a href="https://arxiv.org/format/2002.06758">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Interactive Text-to-Speech System via Joint Style Analysis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gao%2C+Y">Yang Gao</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+W">Weiyi Zheng</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Zhaojun Yang</a>, <a href="/search/cs?searchtype=author&query=Kohler%2C+T">Thilo Kohler</a>, <a href="/search/cs?searchtype=author&query=Fuegen%2C+C">Christian Fuegen</a>, <a href="/search/cs?searchtype=author&query=He%2C+Q">Qing He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2002.06758v2-abstract-short" style="display: inline;"> While modern TTS technologies have made significant advancements in audio quality, there is still a lack of behavior naturalness compared to conversing with people. We propose a style-embedded TTS system that generates styled responses based on the speech query style. To achieve this, the system includes a style extraction model that extracts a style embedding from the speech query, which is then… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2002.06758v2-abstract-full').style.display = 'inline'; document.getElementById('2002.06758v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2002.06758v2-abstract-full" style="display: none;"> While modern TTS technologies have made significant advancements in audio quality, there is still a lack of behavior naturalness compared to conversing with people. We propose a style-embedded TTS system that generates styled responses based on the speech query style. To achieve this, the system includes a style extraction model that extracts a style embedding from the speech query, which is then used by the TTS to produce a matching response. 
We faced two main challenges: 1) only a small portion of the TTS training dataset has style labels, which is needed to train a multi-style TTS that respects different style embeddings during inference. 2) The TTS system and the style extraction model have disjoint training datasets. We need consistent style labels across these two datasets so that the TTS can learn to respect the labels produced by the style extraction model during inference. To solve these, we adopted a semi-supervised approach that uses the style extraction model to create style labels for the TTS dataset and applied transfer learning to learn the style embedding jointly. Our experiment results show user preference for the styled TTS responses and demonstrate the style-embedded TTS system's capability of mimicking the speech query style. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2002.06758v2-abstract-full').style.display = 'none'; document.getElementById('2002.06758v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 September, 2020; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 16 February, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2020. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by Interspeech 2020</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1912.07875">arXiv:1912.07875</a> <span> [<a href="https://arxiv.org/pdf/1912.07875">pdf</a>, <a href="https://arxiv.org/ps/1912.07875">ps</a>, <a href="https://arxiv.org/format/1912.07875">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/ICASSP40776.2020.9052942">10.1109/ICASSP40776.2020.9052942 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Libri-Light: A Benchmark for ASR with Limited or No Supervision </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kahn%2C+J">Jacob Kahn</a>, <a href="/search/cs?searchtype=author&query=Rivi%C3%A8re%2C+M">Morgane Rivière</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+W">Weiyi Zheng</a>, <a href="/search/cs?searchtype=author&query=Kharitonov%2C+E">Evgeny Kharitonov</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Q">Qiantong Xu</a>, <a href="/search/cs?searchtype=author&query=Mazar%C3%A9%2C+P">Pierre-Emmanuel Mazaré</a>, <a href="/search/cs?searchtype=author&query=Karadayi%2C+J">Julien Karadayi</a>, <a href="/search/cs?searchtype=author&query=Liptchinsky%2C+V">Vitaliy Liptchinsky</a>, <a href="/search/cs?searchtype=author&query=Collobert%2C+R">Ronan Collobert</a>, <a 
href="/search/cs?searchtype=author&query=Fuegen%2C+C">Christian Fuegen</a>, <a href="/search/cs?searchtype=author&query=Likhomanenko%2C+T">Tatiana Likhomanenko</a>, <a href="/search/cs?searchtype=author&query=Synnaeve%2C+G">Gabriel Synnaeve</a>, <a href="/search/cs?searchtype=author&query=Joulin%2C+A">Armand Joulin</a>, <a href="/search/cs?searchtype=author&query=Mohamed%2C+A">Abdelrahman Mohamed</a>, <a href="/search/cs?searchtype=author&query=Dupoux%2C+E">Emmanuel Dupoux</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1912.07875v1-abstract-short" style="display: inline;"> We introduce a new collection of spoken English audio suitable for training speech recognition systems under limited or no supervision. It is derived from open-source audio books from the LibriVox project. It contains over 60K hours of audio, which is, to our knowledge, the largest freely-available corpus of speech. The audio has been segmented using voice activity detection and is tagged with SNR… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1912.07875v1-abstract-full').style.display = 'inline'; document.getElementById('1912.07875v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1912.07875v1-abstract-full" style="display: none;"> We introduce a new collection of spoken English audio suitable for training speech recognition systems under limited or no supervision. It is derived from open-source audio books from the LibriVox project. It contains over 60K hours of audio, which is, to our knowledge, the largest freely-available corpus of speech. The audio has been segmented using voice activity detection and is tagged with SNR, speaker ID and genre descriptions. Additionally, we provide baseline systems and evaluation metrics working under three settings: (1) the zero resource/unsupervised setting (ABX), (2) the semi-supervised setting (PER, CER) and (3) the distant supervision setting (WER). Settings (2) and (3) use limited textual resources (10 minutes to 10 hours) aligned with the speech. Setting (3) uses large amounts of unaligned text. They are evaluated on the standard LibriSpeech dev and test sets for comparison with the supervised state-of-the-art. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1912.07875v1-abstract-full').style.display = 'none'; document.getElementById('1912.07875v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 December, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2019. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1911.02115">arXiv:1911.02115</a> <span> [<a href="https://arxiv.org/pdf/1911.02115">pdf</a>, <a href="https://arxiv.org/ps/1911.02115">ps</a>, <a href="https://arxiv.org/format/1911.02115">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Spatial Attention for Far-field Speech Recognition with Deep Beamforming Neural Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=He%2C+W">Weipeng He</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+L">Lu Lu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+B">Biqiao Zhang</a>, <a href="/search/cs?searchtype=author&query=Mahadeokar%2C+J">Jay Mahadeokar</a>, <a href="/search/cs?searchtype=author&query=Kalgaonkar%2C+K">Kaustubh Kalgaonkar</a>, <a href="/search/cs?searchtype=author&query=Fuegen%2C+C">Christian Fuegen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1911.02115v2-abstract-short" style="display: inline;"> In this paper, we introduce spatial attention for refining the information in multi-direction neural beamformer for far-field automatic speech recognition. Previous approaches of neural beamformers with multiple look directions, such as the factored complex linear projection, have shown promising results. However, the features extracted by such methods contain redundant information, as only the di… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1911.02115v2-abstract-full').style.display = 'inline'; document.getElementById('1911.02115v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1911.02115v2-abstract-full" style="display: none;"> In this paper, we introduce spatial attention for refining the information in multi-direction neural beamformer for far-field automatic speech recognition. Previous approaches of neural beamformers with multiple look directions, such as the factored complex linear projection, have shown promising results. However, the features extracted by such methods contain redundant information, as only the direction of the target speech is relevant. We propose using a spatial attention subnet to weigh the features from different directions, so that the subsequent acoustic model could focus on the most relevant features for the speech recognition. Our experimental results show that spatial attention achieves up to 9% relative word error rate improvement over methods without the attention. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1911.02115v2-abstract-full').style.display = 'none'; document.getElementById('1911.02115v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 March, 2020; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 5 November, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2019. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">To be presented at ICASSP 2020</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1911.01629">arXiv:1911.01629</a> <span> [<a href="https://arxiv.org/pdf/1911.01629">pdf</a>, <a href="https://arxiv.org/format/1911.01629">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> RNN-T For Latency Controlled ASR With Improved Beam Search </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jain%2C+M">Mahaveer Jain</a>, <a href="/search/cs?searchtype=author&query=Schubert%2C+K">Kjell Schubert</a>, <a href="/search/cs?searchtype=author&query=Mahadeokar%2C+J">Jay Mahadeokar</a>, <a href="/search/cs?searchtype=author&query=Yeh%2C+C">Ching-Feng Yeh</a>, <a href="/search/cs?searchtype=author&query=Kalgaonkar%2C+K">Kaustubh Kalgaonkar</a>, <a href="/search/cs?searchtype=author&query=Sriram%2C+A">Anuroop Sriram</a>, <a href="/search/cs?searchtype=author&query=Fuegen%2C+C">Christian Fuegen</a>, <a href="/search/cs?searchtype=author&query=Seltzer%2C+M+L">Michael L. Seltzer</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1911.01629v2-abstract-short" style="display: inline;"> Neural transducer-based systems such as RNN Transducers (RNN-T) for automatic speech recognition (ASR) blend the individual components of a traditional hybrid ASR system (acoustic model, language model, punctuation model, inverse text normalization) into one single model. This greatly simplifies training and inference and hence makes RNN-T a desirable choice for ASR systems. In this work, we inve… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1911.01629v2-abstract-full').style.display = 'inline'; document.getElementById('1911.01629v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1911.01629v2-abstract-full" style="display: none;"> Neural transducer-based systems such as RNN Transducers (RNN-T) for automatic speech recognition (ASR) blend the individual components of a traditional hybrid ASR system (acoustic model, language model, punctuation model, inverse text normalization) into one single model. This greatly simplifies training and inference and hence makes RNN-T a desirable choice for ASR systems. In this work, we investigate the use of RNN-T in applications that require a tune-able latency budget during inference time. We also improved the decoding speed of the originally proposed RNN-T beam search algorithm. We evaluated our proposed system on an English video ASR dataset and show that neural RNN-T models can achieve comparable WER and better computational efficiency compared to a well-tuned hybrid ASR baseline.
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1911.01629v2-abstract-full').style.display = 'none'; document.getElementById('1911.01629v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 January, 2020; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 5 November, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2019. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1910.12977">arXiv:1910.12977</a> <span> [<a href="https://arxiv.org/pdf/1910.12977">pdf</a>, <a href="https://arxiv.org/format/1910.12977">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Transformer-Transducer: End-to-End Speech Recognition with Self-Attention </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yeh%2C+C">Ching-Feng Yeh</a>, <a href="/search/cs?searchtype=author&query=Mahadeokar%2C+J">Jay Mahadeokar</a>, <a href="/search/cs?searchtype=author&query=Kalgaonkar%2C+K">Kaustubh Kalgaonkar</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yongqiang Wang</a>, <a href="/search/cs?searchtype=author&query=Le%2C+D">Duc Le</a>, <a href="/search/cs?searchtype=author&query=Jain%2C+M">Mahaveer Jain</a>, <a href="/search/cs?searchtype=author&query=Schubert%2C+K">Kjell Schubert</a>, <a href="/search/cs?searchtype=author&query=Fuegen%2C+C">Christian Fuegen</a>, <a href="/search/cs?searchtype=author&query=Seltzer%2C+M+L">Michael L. Seltzer</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1910.12977v1-abstract-short" style="display: inline;"> We explore options to use Transformer networks in a neural transducer for end-to-end speech recognition. Transformer networks use self-attention for sequence modeling and come with advantages in parallel computation and capturing contexts. We propose 1) using VGGNet with causal convolution to incorporate positional information and reduce frame rate for efficient inference, and 2) using truncated self-at… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1910.12977v1-abstract-full').style.display = 'inline'; document.getElementById('1910.12977v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1910.12977v1-abstract-full" style="display: none;"> We explore options to use Transformer networks in a neural transducer for end-to-end speech recognition. Transformer networks use self-attention for sequence modeling and come with advantages in parallel computation and capturing contexts. We propose 1) using VGGNet with causal convolution to incorporate positional information and reduce frame rate for efficient inference, and 2) using truncated self-attention to enable streaming for Transformer and reduce computational complexity.
All experiments are conducted on the public LibriSpeech corpus. The proposed Transformer-Transducer outperforms neural transducer with LSTM/BLSTM networks and achieved word error rates of 6.37 % on the test-clean set and 15.30 % on the test-other set, while remaining streamable, compact with 45.7M parameters for the entire system, and computationally efficient with complexity of O(T), where T is input sequence length. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1910.12977v1-abstract-full').style.display = 'none'; document.getElementById('1910.12977v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 October, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2019. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1910.12612">arXiv:1910.12612</a> <span> [<a href="https://arxiv.org/pdf/1910.12612">pdf</a>, <a href="https://arxiv.org/format/1910.12612">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> G2G: TTS-Driven Pronunciation Learning for Graphemic Hybrid ASR </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Le%2C+D">Duc Le</a>, <a href="/search/cs?searchtype=author&query=Koehler%2C+T">Thilo Koehler</a>, <a href="/search/cs?searchtype=author&query=Fuegen%2C+C">Christian Fuegen</a>, <a href="/search/cs?searchtype=author&query=Seltzer%2C+M+L">Michael L. Seltzer</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1910.12612v2-abstract-short" style="display: inline;"> Grapheme-based acoustic modeling has recently been shown to outperform phoneme-based approaches in both hybrid and end-to-end automatic speech recognition (ASR), even on non-phonemic languages like English. However, graphemic ASR still has problems with rare long-tail words that do not follow the standard spelling conventions seen in training, such as entity names. In this work, we present a novel… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1910.12612v2-abstract-full').style.display = 'inline'; document.getElementById('1910.12612v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1910.12612v2-abstract-full" style="display: none;"> Grapheme-based acoustic modeling has recently been shown to outperform phoneme-based approaches in both hybrid and end-to-end automatic speech recognition (ASR), even on non-phonemic languages like English. However, graphemic ASR still has problems with rare long-tail words that do not follow the standard spelling conventions seen in training, such as entity names. 
In this work, we present a novel method to train a statistical grapheme-to-grapheme (G2G) model on text-to-speech data that can rewrite an arbitrary character sequence into more phonetically consistent forms. We show that using G2G to provide alternative pronunciations during decoding reduces Word Error Rate by 3% to 11% relative over a strong graphemic baseline and bridges the gap on rare name recognition with an equivalent phonetic setup. Unlike many previously proposed methods, our method does not require any change to the acoustic model training procedure. This work reaffirms the efficacy of grapheme-based modeling and shows that specialized linguistic knowledge, when available, can be leveraged to improve graphemic ASR. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1910.12612v2-abstract-full').style.display = 'none'; document.getElementById('1910.12612v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 February, 2020; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 22 October, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2019. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">To appear at ICASSP 2020</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1910.09799">arXiv:1910.09799</a> <span> [<a href="https://arxiv.org/pdf/1910.09799">pdf</a>, <a href="https://arxiv.org/format/1910.09799">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/ICASSP40776.2020.9054345">10.1109/ICASSP40776.2020.9054345 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Transformer-based Acoustic Modeling for Hybrid Speech Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yongqiang Wang</a>, <a href="/search/cs?searchtype=author&query=Mohamed%2C+A">Abdelrahman Mohamed</a>, <a href="/search/cs?searchtype=author&query=Le%2C+D">Duc Le</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+C">Chunxi Liu</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+A">Alex Xiao</a>, <a href="/search/cs?searchtype=author&query=Mahadeokar%2C+J">Jay Mahadeokar</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+H">Hongzhao Huang</a>, <a href="/search/cs?searchtype=author&query=Tjandra%2C+A">Andros Tjandra</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xiaohui Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+F">Frank Zhang</a>, <a href="/search/cs?searchtype=author&query=Fuegen%2C+C">Christian Fuegen</a>, <a href="/search/cs?searchtype=author&query=Zweig%2C+G">Geoffrey Zweig</a>, <a href="/search/cs?searchtype=author&query=Seltzer%2C+M+L">Michael L. 
Seltzer</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1910.09799v2-abstract-short" style="display: inline;"> We propose and evaluate transformer-based acoustic models (AMs) for hybrid speech recognition. Several modeling choices are discussed in this work, including various positional embedding methods and an iterated loss to enable training deep transformers. We also present a preliminary study of using limited right context in transformer models, which makes streaming applications possible. We d… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1910.09799v2-abstract-full').style.display = 'inline'; document.getElementById('1910.09799v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1910.09799v2-abstract-full" style="display: none;"> We propose and evaluate transformer-based acoustic models (AMs) for hybrid speech recognition. Several modeling choices are discussed in this work, including various positional embedding methods and an iterated loss to enable training deep transformers. We also present a preliminary study of using limited right context in transformer models, which makes streaming applications possible. We demonstrate that on the widely used Librispeech benchmark, our transformer-based AM outperforms the best published hybrid result by 19% to 26% relative when the standard n-gram language model (LM) is used. Combined with a neural network LM for rescoring, our proposed approach achieves state-of-the-art results on Librispeech. Our findings are also confirmed on a much larger internal dataset. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1910.09799v2-abstract-full').style.display = 'none'; document.getElementById('1910.09799v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 April, 2020; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 22 October, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2019.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">to appear in ICASSP 2020</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1910.01493">arXiv:1910.01493</a> <span> [<a href="https://arxiv.org/pdf/1910.01493">pdf</a>, <a href="https://arxiv.org/format/1910.01493">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> From Senones to Chenones: Tied Context-Dependent Graphemes for Hybrid Speech Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Le%2C+D">Duc Le</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xiaohui Zhang</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+W">Weiyi Zheng</a>, <a href="/search/cs?searchtype=author&query=F%C3%BCgen%2C+C">Christian Fügen</a>, <a href="/search/cs?searchtype=author&query=Zweig%2C+G">Geoffrey Zweig</a>, <a href="/search/cs?searchtype=author&query=Seltzer%2C+M+L">Michael L. Seltzer</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1910.01493v2-abstract-short" style="display: inline;"> There is an implicit assumption that traditional hybrid approaches for automatic speech recognition (ASR) cannot directly model graphemes and need to rely on phonetic lexicons to get competitive performance, especially on English which has poor grapheme-phoneme correspondence. In this work, we show for the first time that, on English, hybrid ASR systems can in fact model graphemes effectively by l… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1910.01493v2-abstract-full').style.display = 'inline'; document.getElementById('1910.01493v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1910.01493v2-abstract-full" style="display: none;"> There is an implicit assumption that traditional hybrid approaches for automatic speech recognition (ASR) cannot directly model graphemes and need to rely on phonetic lexicons to get competitive performance, especially on English which has poor grapheme-phoneme correspondence. In this work, we show for the first time that, on English, hybrid ASR systems can in fact model graphemes effectively by leveraging tied context-dependent graphemes, i.e., chenones. Our chenone-based systems significantly outperform equivalent senone baselines by 4.5% to 11.1% relative on three different English datasets. Our results on Librispeech are state-of-the-art compared to other hybrid approaches and competitive with previously published end-to-end numbers. Further analysis shows that chenones can better utilize powerful acoustic models and large training data, and require context- and position-dependent modeling to work well.
Chenone-based systems also outperform senone baselines on proper noun and rare word recognition, an area where the latter is traditionally thought to have an advantage. Our work provides an alternative for end-to-end ASR and establishes that hybrid systems can be improved by dropping the reliance on phonetic knowledge. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1910.01493v2-abstract-full').style.display = 'none'; document.getElementById('1910.01493v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 October, 2019; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 2 October, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2019. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">To appear at ASRU 2019</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1812.02142">arXiv:1812.02142</a> <span> [<a href="https://arxiv.org/pdf/1812.02142">pdf</a>, <a href="https://arxiv.org/format/1812.02142">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> End-to-end contextual speech recognition using class language models and a token passing decoder </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zhehuai Chen</a>, <a href="/search/cs?searchtype=author&query=Jain%2C+M">Mahaveer Jain</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yongqiang Wang</a>, <a href="/search/cs?searchtype=author&query=Seltzer%2C+M+L">Michael L. Seltzer</a>, <a href="/search/cs?searchtype=author&query=Fuegen%2C+C">Christian Fuegen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1812.02142v1-abstract-short" style="display: inline;"> End-to-end modeling (E2E) of automatic speech recognition (ASR) blends all the components of a traditional speech recognition system into a unified model. Although it simplifies training and decoding pipelines, the unified model is hard to adapt when mismatch exists between training and test data. In this work, we focus on contextual speech recognition, which is particularly challenging for E2E mo… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1812.02142v1-abstract-full').style.display = 'inline'; document.getElementById('1812.02142v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1812.02142v1-abstract-full" style="display: none;"> End-to-end modeling (E2E) of automatic speech recognition (ASR) blends all the components of a traditional speech recognition system into a unified model. Although it simplifies training and decoding pipelines, the unified model is hard to adapt when mismatch exists between training and test data. 
In this work, we focus on contextual speech recognition, which is particularly challenging for E2E models because it introduces significant mismatch between training and test data. To improve the performance in the presence of complex contextual information, we propose to use class-based language models (CLM) that can populate the classes with context-dependent information in real time. To enable this approach to scale to a large number of class members and minimize search errors, we propose a token passing decoder with efficient token recombination for E2E systems for the first time. We evaluate the proposed system on general and contextual ASR, and achieve a relative 62% Word Error Rate (WER) reduction for contextual ASR without hurting performance for general ASR. We show that the proposed method performs well without modification of the decoding hyper-parameters across tasks, making it a general solution for E2E ASR. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1812.02142v1-abstract-full').style.display = 'none'; document.getElementById('1812.02142v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 December, 2018; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2018. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">submitted to ICASSP 2019</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 68T10 <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.7 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1802.08395">arXiv:1802.08395</a> <span> [<a href="https://arxiv.org/pdf/1802.08395">pdf</a>, <a href="https://arxiv.org/format/1802.08395">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Towards end-to-end spoken language understanding </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Serdyuk%2C+D">Dmitriy Serdyuk</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yongqiang Wang</a>, <a href="/search/cs?searchtype=author&query=Fuegen%2C+C">Christian Fuegen</a>, <a href="/search/cs?searchtype=author&query=Kumar%2C+A">Anuj Kumar</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+B">Baiyang Liu</a>, <a href="/search/cs?searchtype=author&query=Bengio%2C+Y">Yoshua Bengio</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1802.08395v1-abstract-short" style="display: inline;"> A spoken language understanding system is traditionally designed as a pipeline of a number of components. First, the audio signal is processed by an automatic speech recognizer for transcription or n-best hypotheses.
With the recognition results, a natural language understanding system classifies the text into structured data (domain, intent and slots) for downstream consumers, such as dialog sys… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1802.08395v1-abstract-full').style.display = 'inline'; document.getElementById('1802.08395v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1802.08395v1-abstract-full" style="display: none;"> A spoken language understanding system is traditionally designed as a pipeline of a number of components. First, the audio signal is processed by an automatic speech recognizer for transcription or n-best hypotheses. With the recognition results, a natural language understanding system classifies the text into structured data (domain, intent and slots) for downstream consumers, such as dialog systems and hands-free applications. These components are usually developed and optimized independently. In this paper, we present our study on an end-to-end learning system for spoken language understanding. With this unified approach, we can infer the semantic meaning directly from audio features without the intermediate text representation. This study showed that the trained model can achieve reasonably good results and demonstrated that the model can capture the semantic attention directly from the audio features. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1802.08395v1-abstract-full').style.display = 'none'; document.getElementById('1802.08395v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 February, 2018; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2018. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">submitted to ICASSP 2018</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1711.01369">arXiv:1711.01369</a> <span> [<a href="https://arxiv.org/pdf/1711.01369">pdf</a>, <a href="https://arxiv.org/format/1711.01369">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Knowledge Transfer from Weakly Labeled Audio using Convolutional Neural Network for Sound Events and Scenes </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kumar%2C+A">Anurag Kumar</a>, <a href="/search/cs?searchtype=author&query=Khadkevich%2C+M">Maksim Khadkevich</a>, <a href="/search/cs?searchtype=author&query=Fugen%2C+C">Christian Fugen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1711.01369v4-abstract-short" style="display: inline;"> In this work we propose approaches to effectively transfer knowledge from weakly labeled web audio data.
We first describe a convolutional neural network (CNN) based framework for sound event detection and classification using weakly labeled audio data. Our model trains efficiently from audio clips of variable length; hence, it is well suited for transfer learning. We then propose methods to learn rep… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1711.01369v4-abstract-full').style.display = 'inline'; document.getElementById('1711.01369v4-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1711.01369v4-abstract-full" style="display: none;"> In this work we propose approaches to effectively transfer knowledge from weakly labeled web audio data. We first describe a convolutional neural network (CNN) based framework for sound event detection and classification using weakly labeled audio data. Our model trains efficiently from audio clips of variable length; hence, it is well suited for transfer learning. We then propose methods to learn representations using this model which can be effectively used for solving the target task. We study both transductive and inductive transfer learning tasks, showing the effectiveness of our methods for both domain and task adaptation. We show that the representations learned with the proposed CNN model generalize well enough to reach human-level accuracy on the ESC-50 sound events dataset and set state-of-the-art results on this dataset. We further use them for the acoustic scene classification task and once again show that our proposed approaches are well suited for this task. We also show that our methods help capture semantic meanings and relations. Moreover, in this process we also set state-of-the-art results on the AudioSet dataset, relying on the balanced training set. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1711.01369v4-abstract-full').style.display = 'none'; document.getElementById('1711.01369v4-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 September, 2018; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 3 November, 2017; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2017.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICASSP 2018</span> </p> </li> </ol> </div> </main> </body> </html>