Search | arXiv e-print repository

Showing 1–23 of 23 results for author: Kashiwagi, Y

Searching in archive eess (all-archives search: https://arxiv.org/search/?searchtype=author&query=Kashiwagi%2C+Y). Results are sorted by announcement date (newest first), 50 per page.

1. arXiv:2409.15732 [pdf, other]
   Subjects: cs.CL; cs.SD; eess.AS
   Hypothesis Clustering and Merging: Novel MultiTalker Speech Recognition with Speaker Tokens
   Authors: Yosuke Kashiwagi, Hayato Futami, Emiru Tsunoo, Siddhant Arora, Shinji Watanabe
   Abstract: In many real-world scenarios, such as meetings, multiple speakers are present with an unknown number of participants, and their utterances often overlap. We address these multi-speaker challenges with a novel attention-based encoder-decoder method augmented with special speaker class tokens obtained by speaker clustering. During inference, we select multiple recognition hypotheses conditioned on predicted speaker cluster tokens, and these hypotheses are merged by agglomerative hierarchical clustering (AHC) based on the normalized edit distance. The clustered hypotheses yield multi-speaker transcriptions with the appropriate number of speakers determined by AHC. Our experiments on the LibriMix dataset demonstrate that the proposed method was particularly effective in complex 3-mix environments, achieving a 55% relative error reduction on clean data and a 36% relative error reduction on noisy data compared with conventional serialized output training.
   Submitted 24 September, 2024; originally announced September 2024.
   Comments: Submitted to ICASSP 2025
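
Note: the hypothesis-merging step in this abstract can be sketched in a few lines of Python. The sketch below clusters hypothesis strings by agglomerative clustering on normalized edit distance; the single-linkage criterion, whitespace tokenization, and the 0.5 stopping threshold are illustrative assumptions, not details from the paper.

    def edit_distance(a, b):
        # standard dynamic-programming Levenshtein distance over token lists
        dp = list(range(len(b) + 1))
        for i, ta in enumerate(a, 1):
            prev, dp[0] = dp[0], i
            for j, tb in enumerate(b, 1):
                prev, dp[j] = dp[j], min(dp[j] + 1, dp[j - 1] + 1,
                                         prev + (ta != tb))
        return dp[-1]

    def normalized_distance(a, b):
        ta, tb = a.split(), b.split()
        return edit_distance(ta, tb) / max(len(ta), len(tb), 1)

    def merge_hypotheses(hyps, threshold=0.5):
        # single-linkage AHC: repeatedly merge the two closest clusters
        # until the closest remaining pair exceeds the threshold
        clusters = [[h] for h in hyps]
        while len(clusters) > 1:
            d, i, j = min(
                (min(normalized_distance(x, y)
                     for x in clusters[i] for y in clusters[j]), i, j)
                for i in range(len(clusters))
                for j in range(i + 1, len(clusters)))
            if d > threshold:
                break
            clusters[i].extend(clusters.pop(j))
        return clusters

    # two clusters emerge, i.e. two speakers' transcriptions
    print(merge_hypotheses(["thank you very much", "thank you vary much",
                            "see you tomorrow"]))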

2. arXiv:2406.16107 [pdf, ps, other]
   Subjects: eess.AS; cs.CL
   Decoder-only Architecture for Streaming End-to-end Speech Recognition
   Authors: Emiru Tsunoo, Hayato Futami, Yosuke Kashiwagi, Siddhant Arora, Shinji Watanabe
   Abstract: Decoder-only language models (LMs) have been successfully adopted for speech-processing tasks, including automatic speech recognition (ASR). These LMs have ample expressiveness and perform efficiently, which is a suitable characteristic for streaming ASR applications. In this work, we propose using a decoder-only architecture for blockwise streaming ASR. In our approach, speech features are compressed using CTC output and a context embedding from a blockwise speech subnetwork, and are sequentially provided as prompts to the decoder. The decoder estimates the output tokens promptly at each block. To this end, we also propose a novel training scheme using random-length prefix prompts to make the model robust to the truncated prompts caused by blockwise processing. An experimental comparison shows that our proposed decoder-only streaming ASR achieves an 8% relative word error rate reduction on the LibriSpeech test-other set while being twice as fast as the baseline model.
   Submitted 1 August, 2024; v1 submitted 23 June, 2024; originally announced June 2024.
   Comments: Accepted for Interspeech 2024
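
Note: the random-length prefix-prompt training trick is easy to picture in code. A toy Python sketch with illustrative token names (the real prompts are CTC-compressed features and context embeddings, not strings):

    import random

    def random_prefix_prompt(prompt_tokens, rng=random):
        # truncate the prompt to a random length during training so the
        # decoder learns to cope with prompts cut off mid-block
        keep = rng.randint(1, len(prompt_tokens))
        return prompt_tokens[:keep]

    rng = random.Random(0)
    prompt = ["<ctc:1>", "<ctc:2>", "<ctx:block1>", "<ctx:block2>"]
    for _ in range(3):
        print(random_prefix_prompt(prompt, rng))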

3. arXiv:2406.12611 [pdf, other]
   Subjects: cs.SD; cs.CL; eess.AS
   Rapid Language Adaptation for Multilingual E2E Speech Recognition Using Encoder Prompting
   Authors: Yosuke Kashiwagi, Hayato Futami, Emiru Tsunoo, Siddhant Arora, Shinji Watanabe
   Abstract: End-to-end multilingual speech recognition models handle multiple languages through a single model, often incorporating language identification to automatically detect the language of incoming speech. Since, in the common scenario, the language is already known, these models can act as language-specific models by using language information as prompts, which is particularly beneficial for attention-based encoder-decoder architectures. However, the Connectionist Temporal Classification (CTC) approach, which enhances recognition via joint decoding and multi-task training, does not normally incorporate language prompts because of its conditionally independent output tokens. To overcome this, we introduce an encoder prompting technique within the self-conditioned CTC framework, enabling language-specific adaptation of the CTC model in a zero-shot manner. Our method has been shown to significantly reduce errors by 28% on average and by 41% on low-resource languages.
   Submitted 18 June, 2024; originally announced June 2024.
   Comments: Accepted by INTERSPEECH 2024
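
Note: one way to picture the encoder prompting idea is as a learned language embedding added to intermediate encoder features. A rough PyTorch sketch, with the injection point and dimensions assumed for illustration:

    import torch
    import torch.nn as nn

    class LanguagePrompt(nn.Module):
        # adds a learned per-language embedding to intermediate encoder
        # features, e.g. those fed back by self-conditioned CTC
        def __init__(self, dim, num_langs):
            super().__init__()
            self.emb = nn.Embedding(num_langs, dim)

        def forward(self, feats, lang_id):
            # feats: (batch, time, dim); lang_id: (batch,)
            return feats + self.emb(lang_id)[:, None, :]

    prompt = LanguagePrompt(dim=8, num_langs=4)
    feats = torch.randn(2, 10, 8)
    print(prompt(feats, torch.tensor([0, 3])).shape)  # (2, 10, 8)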

4. arXiv:2406.12317 [pdf, other]
   Subjects: cs.CL; eess.AS
   Finding Task-specific Subnetworks in Multi-task Spoken Language Understanding Model
   Authors: Hayato Futami, Siddhant Arora, Yosuke Kashiwagi, Emiru Tsunoo, Shinji Watanabe
   Abstract: Recently, multi-task spoken language understanding (SLU) models have emerged, designed to address various speech processing tasks. However, these models often rely on a large number of parameters. They also often encounter difficulties in adapting to new data for a specific task without catastrophic forgetting of previously trained tasks. In this study, we propose finding task-specific subnetworks within a multi-task SLU model via neural network pruning. In addition to model compression, we expect that the forgetting of previously trained tasks can be mitigated by updating only a task-specific subnetwork. We conduct experiments on top of the state-of-the-art multi-task SLU model "UniverSLU", trained for several tasks such as emotion recognition (ER), intent classification (IC), and automatic speech recognition (ASR). We show that pruned models were successful in adapting to additional ASR or IC data with minimal performance degradation on previously trained tasks.
   Submitted 18 June, 2024; originally announced June 2024.
   Comments: Accepted to Interspeech 2024
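
Note: the subnetwork idea rests on pruning masks. A minimal PyTorch sketch of one common criterion, magnitude pruning (the paper's exact pruning recipe is not specified in this abstract):

    import torch

    def subnetwork_mask(weight, keep_ratio=0.3):
        # keep the largest-magnitude weights; the surviving entries define
        # a task-specific subnetwork that alone is updated during adaptation
        k = max(1, int(weight.numel() * keep_ratio))
        thresh = weight.abs().flatten().topk(k).values.min()
        return (weight.abs() >= thresh).float()

    w = torch.randn(4, 4)
    mask = subnetwork_mask(w, keep_ratio=0.25)
    print(int(mask.sum()), "of", w.numel(), "weights kept")
    # during adaptation, gradients are masked: w.grad *= mask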

5. arXiv:2312.09582 [pdf, other]
   Subjects: cs.CL; cs.SD; eess.AS
   Phoneme-aware Encoding for Prefix-tree-based Contextual ASR
   Authors: Hayato Futami, Emiru Tsunoo, Yosuke Kashiwagi, Hiroaki Ogawa, Siddhant Arora, Shinji Watanabe
   Abstract: In speech recognition applications, it is important to recognize context-specific rare words, such as proper nouns. The Tree-constrained Pointer Generator (TCPGen), which efficiently biases such words with a prefix tree, has shown promise for this purpose. While the original TCPGen relies on grapheme-based encoding, we propose extending it with phoneme-aware encoding to better recognize words with unusual pronunciations. As TCPGen handles biasing words as subword units, we propose obtaining subword-level phoneme-aware encodings by using the alignment between phonemes and subwords. Furthermore, we propose injecting phoneme-level predictions from CTC into the queries of TCPGen so that the model better interprets the phoneme-aware encodings. We conducted ASR experiments with TCPGen for an RNN transducer. We observed that the proposed phoneme-aware encoding outperformed ordinary grapheme-based encoding on both the English LibriSpeech and Japanese CSJ datasets, demonstrating the robustness of our approach across linguistically diverse languages.
   Submitted 15 December, 2023; originally announced December 2023.
   Comments: Accepted to ICASSP 2024
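
Note: the prefix tree at the heart of TCPGen-style biasing is simple to sketch. The Python trie over subword units below shows only the data structure; the neural pointer mechanism and the paper's phoneme-aware encodings are not reproduced:

    def build_trie(biasing_words):
        root = {}
        for subwords in biasing_words:
            node = root
            for sw in subwords:
                node = node.setdefault(sw, {})
            node["<eow>"] = {}  # marks a complete biasing word
        return root

    def valid_continuations(trie, prefix):
        # subwords that may follow the given prefix of a biasing word
        node = trie
        for sw in prefix:
            node = node.get(sw)
            if node is None:
                return set()
        return set(node)

    trie = build_trie([["Kashi", "wagi"], ["Kashi", "ma"], ["Tsunoo"]])
    print(valid_continuations(trie, ["Kashi"]))  # {'wagi', 'ma'}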

6. arXiv:2310.02973 [pdf, other]
   Subjects: cs.CL; cs.SD; eess.AS
   UniverSLU: Universal Spoken Language Understanding for Diverse Tasks with Natural Language Instructions
   Authors: Siddhant Arora, Hayato Futami, Jee-weon Jung, Yifan Peng, Roshan Sharma, Yosuke Kashiwagi, Emiru Tsunoo, Karen Livescu, Shinji Watanabe
   Abstract: Recent studies leverage large language models with multi-tasking capabilities, using natural language prompts to guide the model's behavior and surpassing the performance of task-specific models. Motivated by this, we ask: can we build a single model that jointly performs various spoken language understanding (SLU) tasks? We start by adapting a pre-trained automatic speech recognition model to additional tasks using single-token task specifiers. We enhance this approach through instruction tuning, i.e., fine-tuning by describing the task using natural language instructions followed by the list of label options. Our approach can generalize to new task descriptions for the seen tasks during inference, thereby enhancing its user-friendliness. We demonstrate the efficacy of our single multi-task learning model "UniverSLU" for 12 speech classification and sequence generation task types spanning 17 datasets and 9 languages. On most tasks, UniverSLU achieves competitive performance and often even surpasses task-specific models. Additionally, we assess the zero-shot capabilities, finding that the model generalizes to new datasets and languages for seen task types.
   Submitted 3 April, 2024; v1 submitted 4 October, 2023; originally announced October 2023.
   Comments: Accepted at NAACL 2024
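
Note: the instruction-tuning recipe (a task description followed by the list of label options) suggests a prompt shape like the hypothetical Python template below; the wording is an assumption, not UniverSLU's actual format:

    def build_instruction(task_description, label_options):
        # hypothetical template: natural-language task description
        # followed by the candidate labels
        return f"{task_description} Options: {', '.join(label_options)}."

    print(build_instruction(
        "Classify the emotion expressed in the utterance.",
        ["happy", "sad", "angry", "neutral"]))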

7. arXiv:2309.08876 [pdf, ps, other]
   Subjects: eess.AS; cs.SD
   Decoder-only Architecture for Speech Recognition with CTC Prompts and Text Data Augmentation
   Authors: Emiru Tsunoo, Hayato Futami, Yosuke Kashiwagi, Siddhant Arora, Shinji Watanabe
   Abstract: Collecting audio-text pairs is expensive; however, it is much easier to access text-only data. Unless using shallow fusion, end-to-end automatic speech recognition (ASR) models require architecture modifications or additional training schemes to use text-only data. Inspired by recent advances in decoder-only language models (LMs), such as GPT-3 and PaLM adopted for speech-processing tasks, we propose using a decoder-only architecture for ASR with simple text augmentation. To provide audio information, encoder features compressed by CTC prediction are used as prompts for the decoder, which can be regarded as refining the CTC prediction using the decoder-only model. Because the decoder architecture is the same as an autoregressive LM, it is simple to enhance the model by leveraging external text data with LM training. An experimental comparison using LibriSpeech and Switchboard shows that our proposed models with text augmentation training reduced word error rates from ordinary CTC by 0.3% and 1.4% on the LibriSpeech test-clean and test-other sets, respectively, and by 2.9% and 5.0% on Switchboard and CallHome. The proposed model had an advantage in computational efficiency compared with conventional encoder-decoder ASR models with a similar parameter setup, and outperformed them in the LibriSpeech 100h and Switchboard training scenarios.
   Submitted 9 January, 2024; v1 submitted 16 September, 2023; originally announced September 2023.
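
Note: "encoder features compressed by CTC prediction" builds on the standard CTC reduction, which collapses repeats and drops blanks. A minimal Python sketch of that reduction, shown over token ids rather than the paper's feature vectors:

    def ctc_compress(frame_ids, blank=0):
        # collapse consecutive repeats, then drop blank frames
        out, prev = [], None
        for t in frame_ids:
            if t != blank and t != prev:
                out.append(t)
            prev = t
        return out

    print(ctc_compress([0, 7, 7, 0, 0, 4, 4, 4, 0, 7]))  # [7, 4, 7]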

8. arXiv:2307.12767 [pdf, ps, other]
   Subjects: eess.AS; cs.SD
   Integration of Frame- and Label-synchronous Beam Search for Streaming Encoder-decoder Speech Recognition
   Authors: Emiru Tsunoo, Hayato Futami, Yosuke Kashiwagi, Siddhant Arora, Shinji Watanabe
   Abstract: Although frame-based models, such as CTC and transducers, have an affinity for streaming automatic speech recognition, their decoding uses no future knowledge, which could lead to incorrect pruning. Conversely, the label-based attention encoder-decoder mitigates this issue using soft attention to the input, but it tends to overestimate labels biased towards its training domain, unlike CTC. We exploit these complementary attributes and propose integrating frame- and label-synchronous (F-/L-Sync) decoding, performed alternately within a single beam-search scheme. F-Sync decoding leads the decoding for block-wise processing, while L-Sync decoding provides prioritized hypotheses using look-ahead future frames within a block. We maintain the hypotheses from both decoding methods to perform effective pruning. Experiments demonstrate that the proposed search algorithm achieves lower error rates than other search methods while being robust in out-of-domain situations.
   Submitted 24 July, 2023; originally announced July 2023.
   Comments: Accepted for Interspeech 2023
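
Note: the pruning idea of maintaining hypotheses from both decoding passes can be caricatured as keeping the union of the top candidates under each score. A toy Python sketch with placeholder scorers, not the paper's algorithm:

    def joint_prune(hyps, frame_score, label_score, beam=2):
        # keep the union of the top-beam hypotheses under each scorer
        top_f = sorted(hyps, key=frame_score, reverse=True)[:beam]
        top_l = sorted(hyps, key=label_score, reverse=True)[:beam]
        return list(dict.fromkeys(top_f + top_l))  # order-preserving union

    hyps = ["a", "ab", "abc", "abd"]
    print(joint_prune(hyps, frame_score=len, label_score=lambda h: -len(h)))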

9. arXiv:2307.11005 [pdf, other]
   Subjects: cs.CL; cs.SD; eess.AS
   Integrating Pretrained ASR and LM to Perform Sequence Generation for Spoken Language Understanding
   Authors: Siddhant Arora, Hayato Futami, Yosuke Kashiwagi, Emiru Tsunoo, Brian Yan, Shinji Watanabe
   Abstract: There has been increased interest in the integration of pretrained speech recognition (ASR) and language models (LM) into the SLU framework. However, prior methods often struggle with a vocabulary mismatch between pretrained models, and the LM cannot be directly utilized as it diverges from the NLU formulation. In this study, we propose a three-pass end-to-end (E2E) SLU system that effectively integrates ASR and LM subnetworks into the SLU formulation for sequence generation tasks. In the first pass, our architecture predicts ASR transcripts using the ASR subnetwork. This is followed by the LM subnetwork, which makes an initial SLU prediction. Finally, in the third pass, the deliberation subnetwork conditions on representations from the ASR and LM subnetworks to make the final prediction. Our proposed three-pass SLU system shows improved performance over cascaded and E2E SLU models on two benchmark SLU datasets, SLURP and SLUE, especially on acoustically challenging utterances.
   Submitted 20 July, 2023; originally announced July 2023.
   Comments: Accepted at INTERSPEECH 2023
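
Note: the three passes compose naturally. A schematic Python sketch with each subnetwork stubbed as a callable; the names and interfaces are illustrative, not the paper's API:

    def three_pass_slu(speech, asr, lm, deliberation):
        transcript = asr(speech)                        # pass 1: ASR transcript
        draft = lm(transcript)                          # pass 2: initial SLU prediction
        return deliberation(speech, transcript, draft)  # pass 3: final prediction

    # toy stand-ins just to show the data flow
    print(three_pass_slu(
        "<audio>",
        asr=lambda s: "play jazz",
        lm=lambda t: "[IN:PLAY_MUSIC]",
        deliberation=lambda s, t, d: f"{d} [SL:GENRE jazz]"))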

10. arXiv:2306.01247 [pdf, other]
    Subjects: eess.AS
    Tensor decomposition for minimization of E2E SLU model toward on-device processing
    Authors: Yosuke Kashiwagi, Siddhant Arora, Hayato Futami, Jessica Huynh, Shih-Lun Wu, Yifan Peng, Brian Yan, Emiru Tsunoo, Shinji Watanabe
    Abstract: Spoken Language Understanding (SLU) is a critical speech recognition application and is often deployed on edge devices. Consequently, on-device processing plays a significant role in the practical implementation of SLU. This paper focuses on the end-to-end (E2E) SLU model because of its low-latency property, unlike a cascade system, and aims to minimize the computational cost. We reduce the model size by applying tensor decomposition to the Conformer and E-Branchformer architectures used in our E2E SLU models. We propose applying singular value decomposition to linear layers and the Tucker decomposition to convolution layers, respectively. We also compare CANDECOMP/PARAFAC decomposition and Tensor-Train decomposition to the Tucker decomposition. Since the E2E model is represented by a single neural network, our tensor decomposition can flexibly control the number of parameters without changing feature dimensions. On the STOP dataset, we achieved 70.9% exact match accuracy under the tight constraint of only 15 million parameters.
    Submitted 1 June, 2023; originally announced June 2023.
    Comments: Accepted by INTERSPEECH 2023
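
Note: the SVD-on-linear-layers step replaces one (out x in) weight matrix with two low-rank factors. A PyTorch sketch under stated assumptions (symmetric split of the singular values; rank chosen freely as the compression knob):

    import torch
    import torch.nn as nn

    def factorize_linear(layer, rank):
        # W ~= (U_r sqrt(S_r)) (sqrt(S_r) Vh_r): two smaller linear layers
        U, S, Vh = torch.linalg.svd(layer.weight, full_matrices=False)
        first = nn.Linear(layer.in_features, rank, bias=False)
        second = nn.Linear(rank, layer.out_features,
                           bias=layer.bias is not None)
        first.weight.data = S[:rank].sqrt()[:, None] * Vh[:rank]
        second.weight.data = U[:, :rank] * S[:rank].sqrt()
        if layer.bias is not None:
            second.bias.data = layer.bias.data.clone()
        return nn.Sequential(first, second)

    layer = nn.Linear(256, 256)
    small = factorize_linear(layer, rank=32)
    x = torch.randn(1, 256)
    print((layer(x) - small(x)).abs().max())  # rank-32 approximation error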

11. arXiv:2305.01620 [pdf, ps, other]
    Subjects: cs.CL; cs.SD; eess.AS
    A Study on the Integration of Pipeline and E2E SLU systems for Spoken Semantic Parsing toward STOP Quality Challenge
    Authors: Siddhant Arora, Hayato Futami, Shih-Lun Wu, Jessica Huynh, Yifan Peng, Yosuke Kashiwagi, Emiru Tsunoo, Brian Yan, Shinji Watanabe
    Abstract: Recently there have been efforts to introduce new benchmark tasks for spoken language understanding (SLU), like semantic parsing. In this paper, we describe our proposed spoken semantic parsing system for the quality track (Track 1) of the Spoken Language Understanding Grand Challenge, which is part of the ICASSP Signal Processing Grand Challenge 2023. We experiment with both end-to-end and pipeline systems for this task. Strong automatic speech recognition (ASR) models like Whisper and pretrained language models (LM) like BART are utilized inside our SLU framework to boost performance. We also investigate the output-level combination of various models, achieving an exact match accuracy of 80.8, which won first place at the challenge.
    Submitted 6 May, 2023; v1 submitted 2 May, 2023; originally announced May 2023.
    Comments: First Place in Track 1 of STOP Challenge, which is part of ICASSP Signal Processing Grand Challenge 2023
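
Note: output-level combination can be as simple as a majority vote over the systems' parses. A minimal Python sketch; the tie-breaking rule below is an assumption, and the paper's combination is likely more elaborate:

    from collections import Counter

    def combine(outputs):
        # majority vote; ties go to the parse seen first
        return Counter(outputs).most_common(1)[0][0]

    print(combine([
        "[IN:GET_WEATHER [SL:LOCATION boston ] ]",
        "[IN:GET_WEATHER [SL:LOCATION boston ] ]",
        "[IN:GET_WEATHER [SL:LOCATION austin ] ]"]))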

12. arXiv:2305.01194 [pdf, ps, other]
    Subjects: cs.CL; cs.SD; eess.AS
    The Pipeline System of ASR and NLU with MLM-based Data Augmentation toward STOP Low-resource Challenge
    Authors: Hayato Futami, Jessica Huynh, Siddhant Arora, Shih-Lun Wu, Yosuke Kashiwagi, Yifan Peng, Brian Yan, Emiru Tsunoo, Shinji Watanabe
    Abstract: This paper describes our system for the low-resource domain adaptation track (Track 3) of the Spoken Language Understanding Grand Challenge, which is part of the ICASSP Signal Processing Grand Challenge 2023. In the track, we adopt a pipeline approach of ASR and NLU. For ASR, we fine-tune Whisper for each domain with upsampling. For NLU, we fine-tune BART on all the Track 3 data and then on the low-resource domain data. We apply masked LM (MLM)-based data augmentation, where some of the input tokens and the corresponding target labels are replaced using the MLM. We also apply a retrieval-based approach, where the model input is augmented with similar training samples. As a result, we achieved exact match (EM) accuracies of 63.3/75.0 (average: 69.15) for the reminder/weather domains, and won first place at the challenge.
    Submitted 11 May, 2023; v1 submitted 2 May, 2023; originally announced May 2023.
    Comments: To appear at ICASSP 2023
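
Note: MLM-based augmentation can be approximated with an off-the-shelf fill-mask model. A Python sketch assuming roberta-base, one masked token at a time, and untouched target labels (the paper also replaces the corresponding labels):

    import random
    from transformers import pipeline

    fill = pipeline("fill-mask", model="roberta-base")

    def augment(sentence, rng=random):
        # mask one random token and substitute the model's top prediction
        tokens = sentence.split()
        i = rng.randrange(len(tokens))
        masked = " ".join(tokens[:i] + [fill.tokenizer.mask_token]
                          + tokens[i + 1:])
        tokens[i] = fill(masked)[0]["token_str"].strip()
        return " ".join(tokens)

    print(augment("set a reminder to water the plants tomorrow"))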
arXiv:2211.08726 (https://arxiv.org/abs/2211.08726) [cs.CL, cs.SD, eess.AS]
Streaming Joint Speech Recognition and Disfluency Detection
Authors: Hayato Futami, Emiru Tsunoo, Kentaro Shibata, Yosuke Kashiwagi, Takao Okuda, Siddhant Arora, Shinji Watanabe
Abstract: Disfluency detection has mainly been solved with a pipeline approach, as post-processing of speech recognition. In this study, we propose Transformer-based encoder-decoder models that jointly solve speech recognition and disfluency detection and work in a streaming manner. Compared to pipeline approaches, the joint models can leverage acoustic information, which makes disfluency detection robust to recognition errors and provides non-verbal clues. Moreover, joint modeling yields low-latency, lightweight inference. We investigate two joint model variants for streaming disfluency detection: a transcript-enriched model and a multi-task model. The transcript-enriched model is trained on text with special tags indicating the start and end of the disfluent part; however, it has problems with latency and standard language model adaptation that arise from the additional disfluency tags.
We therefore propose a multi-task model, which has two output layers at the Transformer decoder: one for speech recognition and the other for disfluency detection. The disfluency output is conditioned on the currently recognized token through an additional token-dependency mechanism. We show that the proposed joint models outperform a BERT-based pipeline approach in both accuracy and latency, on both Switchboard and the Corpus of Spontaneous Japanese.
Submitted 11 May, 2023; v1 submitted 16 November, 2022; originally announced November 2022.
Comments: Accepted at ICASSP 2023.
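A minimal sketch of the two-headed decoder idea: one decoder state feeds an ASR head and a disfluency head, with the disfluency head additionally conditioned on the token just emitted. The conditioning here (concatenating a greedy-token embedding) is a stand-in for the paper's token-dependency mechanism, not its exact form.

```python
import torch
import torch.nn as nn

class JointDecoderHeads(nn.Module):
    """Two output layers over a shared decoder state: ASR and disfluency."""
    def __init__(self, d_model=256, vocab=5000, n_tags=3):
        super().__init__()
        self.asr_head = nn.Linear(d_model, vocab)
        self.tok_emb = nn.Embedding(vocab, d_model)
        self.disf_head = nn.Linear(2 * d_model, n_tags)  # e.g., fluent / disfluent / edit

    def forward(self, dec_state):                 # (B, T, d_model)
        asr_logits = self.asr_head(dec_state)     # (B, T, vocab)
        tokens = asr_logits.argmax(-1)            # greedy tokens used for conditioning
        cond = torch.cat([dec_state, self.tok_emb(tokens)], dim=-1)
        return asr_logits, self.disf_head(cond)   # (B, T, n_tags)

heads = JointDecoderHeads()
asr, disf = heads(torch.randn(2, 10, 256))
```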
arXiv:2206.07430 (https://arxiv.org/abs/2206.07430) [eess.AS, cs.SD]
Residual Language Model for End-to-end Speech Recognition
Authors: Emiru Tsunoo, Yosuke Kashiwagi, Chaitanya Narisetty, Shinji Watanabe
Abstract: End-to-end automatic speech recognition struggles to adapt to unknown target-domain speech despite being trained on a large amount of paired audio-text data. Recent studies estimate the linguistic bias of the model as an internal language model (LM). To adapt effectively to the target domain, the internal LM is subtracted from the posterior during inference and fused with an external target-domain LM. However, this fusion complicates inference, and the internal LM estimate is not always accurate. In this paper, we propose a simple external LM fusion method for domain adaptation that accounts for internal LM estimation during training: we directly model the residual factor of the external and internal LMs, namely the residual LM. To train the residual LM stably, we propose smoothing the estimated internal LM and optimizing with a combination of cross-entropy and mean-squared-error losses, which reflect the statistical behavior of the internal LM on target-domain data. We experimentally confirm that the proposed residual LM performs better than internal LM estimation in most cross-domain and intra-domain scenarios.
Submitted 15 June, 2022; originally announced June 2022.
Comments: Accepted for Interspeech 2022.
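A schematic of the inference-time simplification the abstract describes: because the residual LM stands in for "external LM minus internal LM", one weighted addition replaces the usual subtract-internal / add-external pair. The fusion weight is an assumed tuning parameter.

```python
import numpy as np

def fused_score(log_p_e2e: np.ndarray, log_p_res: np.ndarray, lam: float = 0.3):
    """Shallow-fusion-style scoring with a residual LM: a single addition
    instead of subtracting an internal LM and adding an external one."""
    return log_p_e2e + lam * log_p_res

# next-token scores over a toy 4-word vocabulary
e2e = np.log(np.array([0.5, 0.2, 0.2, 0.1]))
res = np.log(np.array([0.1, 0.6, 0.2, 0.1]))
print(fused_score(e2e, res).argmax())  # token favored after fusion
```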
arXiv:2202.01405 (https://arxiv.org/abs/2202.01405) [eess.AS, cs.CL, cs.SD]
Joint Speech Recognition and Audio Captioning
Authors: Chaitanya Narisetty, Emiru Tsunoo, Xuankai Chang, Yosuke Kashiwagi, Michael Hentschel, Shinji Watanabe
Abstract: Speech samples recorded in both indoor and outdoor environments are often contaminated with secondary audio sources. Most end-to-end monaural speech recognition systems either remove these background sounds with speech enhancement or train noise-robust models. For better model interpretability and holistic understanding, we aim to bring together the growing field of automated audio captioning (AAC) and the thoroughly studied field of automatic speech recognition (ASR). The goal of AAC is to generate natural-language descriptions of the contents of audio samples. We propose several approaches for end-to-end joint modeling of the ASR and AAC tasks and demonstrate their advantages over traditional approaches, which model the tasks independently. A major hurdle in evaluating our proposed approaches is the lack of labeled audio datasets with both speech transcriptions and audio captions; we therefore create a multi-task dataset by mixing the clean-speech Wall Street Journal corpus with multiple levels of background noise chosen from the AudioCaps dataset. Extensive experimental evaluation shows improvements over existing state-of-the-art ASR and AAC methods.
Submitted 2 February, 2022; originally announced February 2022.
Comments: 5 pages, 2 figures. Accepted for ICASSP 2022.
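A minimal sketch of the dataset-construction step, mixing clean speech with background noise at a target signal-to-noise ratio; the SNR formulation is standard practice, and the exact mixing recipe used in the paper may differ.

```python
import numpy as np

def mix_at_snr(speech: np.ndarray, noise: np.ndarray, snr_db: float) -> np.ndarray:
    """Scale noise so that 10*log10(P_speech / P_noise) == snr_db, then add."""
    noise = np.resize(noise, speech.shape)        # loop/trim noise to speech length
    p_speech = np.mean(speech ** 2)
    p_noise = np.mean(noise ** 2) + 1e-12
    scale = np.sqrt(p_speech / (p_noise * 10 ** (snr_db / 10)))
    return speech + scale * noise

rng = np.random.default_rng(0)
mixed = mix_at_snr(rng.standard_normal(16000), rng.standard_normal(8000), snr_db=10)
```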
arXiv:2201.10190 (https://arxiv.org/abs/2201.10190) [eess.AS, cs.SD]
Run-and-back stitch search: novel block synchronous decoding for streaming encoder-decoder ASR
Authors: Emiru Tsunoo, Chaitanya Narisetty, Michael Hentschel, Yosuke Kashiwagi, Shinji Watanabe
Abstract: Streaming-style inference for encoder-decoder automatic speech recognition (ASR) is important for reducing latency, which is essential for interactive use cases. To this end, we propose a novel blockwise synchronous decoding algorithm with a hybrid approach that combines endpoint prediction and endpoint post-determination. In endpoint prediction, we compute the expected number of tokens yet to be emitted in the encoder features of the current blocks using the CTC posterior; based on this expectation, the decoder predicts the endpoint to maintain continuous block synchronization, like a running stitch. Meanwhile, endpoint post-determination probabilistically detects backward jumps of the source-target attention caused by mispredicted endpoints, then resumes decoding after discarding those hypotheses, like a back stitch. We combine these methods into a hybrid approach, run-and-back stitch search, which reduces computational cost and latency. Evaluations on various ASR tasks show the efficiency of the proposed decoding algorithm, which reduces 90th-percentile latency on the LibriSpeech test set from 1487 ms to 821 ms while maintaining high recognition accuracy.
Submitted 25 January, 2022; originally announced January 2022.
Comments: Accepted for ICASSP 2022.
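A rough illustration of estimating the remaining token count from a CTC posterior: each frame contributes its probability of being non-blank. This is a common simplification, not necessarily the paper's exact estimator (it ignores repeated non-blank frames, for instance).

```python
import numpy as np

def expected_tokens(ctc_posterior: np.ndarray, blank: int = 0) -> float:
    """Expected number of tokens still held by the current encoder block."""
    return float(np.sum(1.0 - ctc_posterior[:, blank]))

# posterior over (frames, vocab); blank index 0
post = np.array([[0.9, 0.05, 0.05],
                 [0.2, 0.7,  0.1],
                 [0.8, 0.1,  0.1],
                 [0.3, 0.1,  0.6]])
print(expected_tokens(post))  # ~1.8 tokens expected in this block
```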
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2201.10190v1-abstract-full').style.display = 'none'; document.getElementById('2201.10190v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 January, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted for ICASSP2022</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2110.05968">arXiv:2110.05968</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2110.05968">pdf</a>, <a href="https://arxiv.org/ps/2110.05968">ps</a>, <a href="https://arxiv.org/format/2110.05968">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Improving Character Error Rate Is Not Equal to Having Clean Speech: Speech Enhancement for ASR Systems with Black-box Acoustic Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Sawata%2C+R">Ryosuke Sawata</a>, <a href="/search/eess?searchtype=author&amp;query=Kashiwagi%2C+Y">Yosuke Kashiwagi</a>, <a href="/search/eess?searchtype=author&amp;query=Takahashi%2C+S">Shusuke Takahashi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2110.05968v2-abstract-short" style="display: inline;"> A deep neural network (DNN)-based speech enhancement (SE) aiming to maximize the performance of an automatic speech recognition (ASR) system is proposed in this paper. In order to optimize the DNN-based SE model in terms of the character error rate (CER), which is one of the metric to evaluate the ASR system and generally non-differentiable, our method uses two DNNs: one for speech processing and&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2110.05968v2-abstract-full').style.display = 'inline'; document.getElementById('2110.05968v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2110.05968v2-abstract-full" style="display: none;"> A deep neural network (DNN)-based speech enhancement (SE) aiming to maximize the performance of an automatic speech recognition (ASR) system is proposed in this paper. In order to optimize the DNN-based SE model in terms of the character error rate (CER), which is one of the metric to evaluate the ASR system and generally non-differentiable, our method uses two DNNs: one for speech processing and one for mimicking the output CERs derived through an acoustic model (AM). Then both of DNNs are alternately optimized in the training phase. Even if the AM is a black-box, e.g., like one provided by a third-party, the proposed method enables the DNN-based SE model to be optimized in terms of the CER since the DNN mimicking the AM is differentiable. 
arXiv:2106.03419 (https://arxiv.org/abs/2106.03419) [eess.AS, cs.SD]
Data Augmentation Methods for End-to-end Speech Recognition on Distant-Talk Scenarios
Authors: Emiru Tsunoo, Kentaro Shibata, Chaitanya Narisetty, Yosuke Kashiwagi, Shinji Watanabe
Abstract: Although end-to-end automatic speech recognition (E2E ASR) achieves strong performance on tasks with abundant paired data, making E2E ASR robust to noisy and low-resource conditions remains challenging. In this study, we investigate data augmentation methods for E2E ASR in distant-talk scenarios.
E2E ASR models are trained on the series of CHiME challenge datasets, which are suitable for studying robustness against noisy and spontaneous speech. We propose three augmentation methods and their combinations: 1) data augmentation using text-to-speech (TTS) data; 2) cycle-consistent generative adversarial network (Cycle-GAN) augmentation, trained to map between two audio characteristics, clean speech and noisy recordings, to match the test condition; and 3) pseudo-label augmentation, provided by a pretrained ASR module, for smoothing label distributions. Experiments on the CHiME-6/CHiME-4 datasets show that each augmentation method individually improves accuracy on top of conventional SpecAugment, and further improvements are obtained by combining the approaches. Combining all three augmentations on the CHiME-6 task yields a 4.3% word error rate (WER) reduction, a larger gain than that of SpecAugment alone.
Submitted 7 June, 2021; originally announced June 2021.
Comments: Accepted for Interspeech 2021.
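A minimal sketch of the pseudo-label augmentation step: a pretrained recognizer transcribes augmented audio, and the hypotheses become training targets. `pretrained_asr` and its `transcribe` method are hypothetical stand-ins, not an API from the paper.

```python
def pseudo_label(pretrained_asr, augmented_wavs):
    """Pair each augmented waveform with a pseudo transcript."""
    dataset = []
    for wav in augmented_wavs:
        hyp = pretrained_asr.transcribe(wav)  # hypothetical decode call
        dataset.append((wav, hyp))            # audio + pseudo-label pair
    return dataset
```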
arXiv:2102.09168 (https://arxiv.org/abs/2102.09168) [eess.AS, cs.SD]
Gaussian Kernelized Self-Attention for Long Sequence Data and Its Application to CTC-based Speech Recognition
Authors: Yosuke Kashiwagi, Emiru Tsunoo, Shinji Watanabe
Abstract: Self-attention (SA) based models have recently achieved significant performance improvements in hybrid and end-to-end automatic speech recognition (ASR) systems owing to their flexible context-modeling capability. However, accuracy is known to degrade when SA is applied to long sequence data, mainly because of the length mismatch between inference and training data: training data are usually divided into short segments for efficient training. To mitigate this mismatch, we propose a new architecture based on the Gaussian kernel, which is shift-invariant. First, we show mathematically that self-attention with shared weight parameters for queries and keys is equivalent to a normalized kernel function. Replacing this kernel function with the proposed Gaussian kernel makes the architecture completely shift-invariant, with relative position information embedded via a frame-indexing technique. The proposed Gaussian kernelized SA is applied to connectionist temporal classification (CTC) based ASR. Experimental evaluation on the Corpus of Spontaneous Japanese (CSJ) and TEDLIUM 3 benchmarks shows that the proposed SA achieves significant accuracy improvements on long sequence data (e.g., from 24.0% to 6.0% WER on CSJ) without any windowing techniques.
Submitted 18 February, 2021; originally announced February 2021.
Comments: Accepted to ICASSP 2021.
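A minimal sketch of Gaussian kernelized attention with shared query/key weights: attention weights are exp(-||q_i - q_j||^2 / (2*sigma^2)) normalized over j, so they depend only on feature differences. The paper's frame-indexing technique for relative position is omitted here.

```python
import numpy as np

def gaussian_attention(x: np.ndarray, w: np.ndarray, sigma: float = 1.0):
    """Shared-weight projection q_i = k_i = x_i @ w, Gaussian-kernel weights."""
    q = x @ w                                            # (T, d): queries == keys
    d2 = ((q[:, None, :] - q[None, :, :]) ** 2).sum(-1)  # pairwise squared distances
    a = np.exp(-d2 / (2 * sigma ** 2))
    a /= a.sum(axis=1, keepdims=True)                    # row-normalize like softmax
    return a @ x                                         # attention-weighted values

rng = np.random.default_rng(0)
out = gaussian_attention(rng.standard_normal((5, 8)), rng.standard_normal((8, 8)))
```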
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2102.09168v1-abstract-full').style.display = 'none'; document.getElementById('2102.09168v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 February, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to ICASSP2021</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2006.14941">arXiv:2006.14941</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2006.14941">pdf</a>, <a href="https://arxiv.org/ps/2006.14941">ps</a>, <a href="https://arxiv.org/format/2006.14941">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Streaming Transformer ASR with Blockwise Synchronous Beam Search </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Tsunoo%2C+E">Emiru Tsunoo</a>, <a href="/search/eess?searchtype=author&amp;query=Kashiwagi%2C+Y">Yosuke Kashiwagi</a>, <a href="/search/eess?searchtype=author&amp;query=Watanabe%2C+S">Shinji Watanabe</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2006.14941v4-abstract-short" style="display: inline;"> The Transformer self-attention network has shown promising performance as an alternative to recurrent neural networks in end-to-end (E2E) automatic speech recognition (ASR) systems. However, Transformer has a drawback in that the entire input sequence is required to compute both self-attention and source--target attention. In this paper, we propose a novel blockwise synchronous beam search algorit&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2006.14941v4-abstract-full').style.display = 'inline'; document.getElementById('2006.14941v4-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2006.14941v4-abstract-full" style="display: none;"> The Transformer self-attention network has shown promising performance as an alternative to recurrent neural networks in end-to-end (E2E) automatic speech recognition (ASR) systems. However, Transformer has a drawback in that the entire input sequence is required to compute both self-attention and source--target attention. In this paper, we propose a novel blockwise synchronous beam search algorithm based on blockwise processing of encoder to perform streaming E2E Transformer ASR. In the beam search, encoded feature blocks are synchronously aligned using a block boundary detection technique, where a reliability score of each predicted hypothesis is evaluated based on the end-of-sequence and repeated tokens in the hypothesis. 
Evaluations on the HKUST and AISHELL-1 Mandarin, LibriSpeech English, and CSJ Japanese tasks show that the proposed streaming Transformer algorithm outperforms conventional online approaches, including monotonic chunkwise attention (MoChA), especially when using knowledge distillation. An ablation study indicates that the streaming approach reduces response time and that the repetition criterion contributes significantly on certain tasks. Our streaming ASR models achieve performance comparable or superior to batch models and other streaming Transformer methods on all tasks considered.
Submitted 17 November, 2020; v1 submitted 25 June, 2020; originally announced June 2020.
Comments: Accepted for SLT 2021.
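A toy version of the block-boundary heuristic the abstract describes: a hypothesis that emits end-of-sequence or repeats a token within the current block is treated as having run out of acoustic evidence, so decoding moves to the next encoder block. This binary check is a simplification of the paper's reliability score.

```python
def hypothesis_unreliable(tokens: list[int], eos_id: int) -> bool:
    """Flag premature EOS or immediate token repetition within a block."""
    if eos_id in tokens:
        return True
    return any(a == b for a, b in zip(tokens, tokens[1:]))

assert hypothesis_unreliable([5, 5, 9], eos_id=2)      # repetition
assert hypothesis_unreliable([5, 2], eos_id=2)         # premature EOS
assert not hypothesis_unreliable([5, 9, 7], eos_id=2)  # looks fine; keep decoding
```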
arXiv:1910.11871 (https://arxiv.org/abs/1910.11871) [eess.AS, cs.CL, cs.SD]
Towards Online End-to-end Transformer Automatic Speech Recognition
Authors: Emiru Tsunoo, Yosuke Kashiwagi, Toshiyuki Kumakura, Shinji Watanabe
Abstract: The Transformer self-attention network has recently shown promising performance as an alternative to recurrent neural networks in end-to-end (E2E) automatic speech recognition (ASR) systems. However, the Transformer has the drawback that the entire input sequence is required to compute self-attention. We have previously proposed a block processing method for the Transformer encoder that introduces a context-aware inheritance mechanism: an additional context embedding vector, handed over from the previously processed block, helps encode not only local acoustic information but also global linguistic, channel, and speaker attributes. In this paper, we extend this work to an entire online E2E ASR system by introducing an online decoding process, inspired by monotonic chunkwise attention (MoChA), into the Transformer decoder. Our novel MoChA training and inference algorithms exploit properties unique to the Transformer, whose attentions are not always monotonic or peaky and which has multiple heads and residual connections across decoder layers. Evaluations on the Wall Street Journal (WSJ) and AISHELL-1 show that the proposed online Transformer decoder outperforms conventional chunkwise approaches.
Submitted 25 October, 2019; originally announced October 2019.
Comments: arXiv admin note: text overlap with arXiv:1910.07204.
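A bare-bones reading of MoChA-style hard inference, for orientation only: starting from the previous attention position, the decoder scans encoder frames and stops at the first one whose selection probability exceeds 0.5, then attends a small chunk ending there. The paper's multi-head, multi-layer treatment is far richer than this.

```python
import numpy as np

def mocha_hard_endpoint(energies: np.ndarray, start: int) -> int:
    """Scan monotonically; stop where sigmoid(energy) first exceeds 0.5."""
    for t in range(start, len(energies)):
        if 1.0 / (1.0 + np.exp(-energies[t])) > 0.5:
            return t
    return len(energies) - 1  # fall back to the last available frame

print(mocha_hard_endpoint(np.array([-2.0, -1.0, 0.4, 3.0]), start=0))  # -> 2
```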
arXiv:1910.07204 (https://arxiv.org/abs/1910.07204) [eess.AS, cs.CL]
Transformer ASR with Contextual Block Processing
Authors: Emiru Tsunoo, Yosuke Kashiwagi, Toshiyuki Kumakura, Shinji Watanabe
Abstract: The Transformer self-attention network has recently shown promising performance as an alternative to recurrent neural networks (RNNs) in end-to-end (E2E) automatic speech recognition (ASR) systems. However, the Transformer has the drawback that the entire input sequence is required to compute self-attention. In this paper, we propose a new block processing method for the Transformer encoder that introduces a context-aware inheritance mechanism. An additional context embedding vector, handed over from the previously processed block, helps encode not only local acoustic information but also global linguistic, channel, and speaker attributes. We introduce a novel mask technique that implements the context inheritance so the model can be trained efficiently. Evaluations on the Wall Street Journal (WSJ), LibriSpeech, VoxForge Italian, and AISHELL-1 Mandarin speech recognition datasets show that the proposed contextual block processing method consistently outperforms naive block processing. Furthermore, the attention-weight tendency of each layer is analyzed to clarify how the added contextual inheritance mechanism models global information.
Submitted 16 October, 2019; originally announced October 2019.
Comments: Accepted for ASRU 2019.
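A minimal sketch of contextual block processing: each block is encoded together with a context vector inherited from the previous block. Reusing the block's last output position as the next context is our simplification of the paper's context embedding, and the training-time mask technique is omitted.

```python
import torch

def encode_blockwise(encoder_layer, frames, block=16, d=256):
    """Encode fixed-size blocks, handing a context vector block to block."""
    ctx = torch.zeros(1, 1, d)                 # initial context embedding
    outs = []
    for s in range(0, frames.size(1), block):
        blk = torch.cat([ctx, frames[:, s:s + block]], dim=1)
        enc = encoder_layer(blk)               # attend within block + context slot
        outs.append(enc[:, 1:])                # drop the context slot from outputs
        ctx = enc[:, -1:].detach()             # hand the context onward
    return torch.cat(outs, dim=1)

layer = torch.nn.TransformerEncoderLayer(d_model=256, nhead=4, batch_first=True)
y = encode_blockwise(layer, torch.randn(1, 64, 256))
```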
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1910.07204v1-abstract-full').style.display = 'none'; document.getElementById('1910.07204v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 October, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2019. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted for ASRU 2019</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1905.07149">arXiv:1905.07149</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1905.07149">pdf</a>, <a href="https://arxiv.org/ps/1905.07149">ps</a>, <a href="https://arxiv.org/format/1905.07149">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> End-to-end Adaptation with Backpropagation through WFST for On-device Speech Recognition System </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Tsunoo%2C+E">Emiru Tsunoo</a>, <a href="/search/eess?searchtype=author&amp;query=Kashiwagi%2C+Y">Yosuke Kashiwagi</a>, <a href="/search/eess?searchtype=author&amp;query=Asakawa%2C+S">Satoshi Asakawa</a>, <a href="/search/eess?searchtype=author&amp;query=Kumakura%2C+T">Toshiyuki Kumakura</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1905.07149v3-abstract-short" style="display: inline;"> An on-device DNN-HMM speech recognition system efficiently works with a limited vocabulary in the presence of a variety of predictable noise. In such a case, vocabulary and environment adaptation is highly effective. In this paper, we propose a novel method of end-to-end (E2E) adaptation, which adjusts not only an acoustic model (AM) but also a weighted finite-state transducer (WFST). We convert a&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1905.07149v3-abstract-full').style.display = 'inline'; document.getElementById('1905.07149v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1905.07149v3-abstract-full" style="display: none;"> An on-device DNN-HMM speech recognition system efficiently works with a limited vocabulary in the presence of a variety of predictable noise. In such a case, vocabulary and environment adaptation is highly effective. In this paper, we propose a novel method of end-to-end (E2E) adaptation, which adjusts not only an acoustic model (AM) but also a weighted finite-state transducer (WFST). We convert a pretrained WFST to a trainable neural network and adapt the system to target environments/vocabulary by E2E joint training with an AM. 
We replicate Viterbi decoding with forward-backward neural network computation, similar to recurrent neural networks (RNNs). By pooling the output score sequences, a vocabulary posterior for each utterance is obtained and used to compute a discriminative loss. Experiments using 2-10 hours of English/Japanese adaptation data indicate that fine-tuning only the WFST or only the AM is comparable to a state-of-the-art adaptation method, while E2E joint training of the two components achieves the best recognition performance. We also adapt each language system to the other language using the adaptation data; the results show that the proposed method also works well for language adaptation.
Submitted 24 June, 2019; v1 submitted 17 May, 2019; originally announced May 2019.
Comments: Accepted for Interspeech 2019.
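A toy analogue of making the decoding graph trainable: Viterbi expressed as a differentiable max-plus forward recursion over a log-transition matrix, so gradients reach both the "WFST" weights and the acoustic scores. This is an illustration of the general idea only, not the paper's WFST-to-network conversion.

```python
import torch

def viterbi_forward(log_trans: torch.Tensor, log_emit: torch.Tensor):
    """Best-path score via a max-plus recursion; fully differentiable."""
    T, S = log_emit.shape
    alpha = log_emit[0]                                  # (S,) start scores
    for t in range(1, T):
        # best predecessor per state, then add the new frame's emission
        alpha = (alpha[:, None] + log_trans).max(dim=0).values + log_emit[t]
    return alpha.max()

log_trans = torch.randn(3, 3, requires_grad=True)        # trainable "WFST" weights
log_emit = torch.log_softmax(torch.randn(5, 3), dim=-1)  # AM scores for 5 frames
viterbi_forward(log_trans, log_emit).backward()          # gradients reach log_trans
```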
href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>
