Search | arXiv e-print repository

<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1&ndash;27 of 27 results for author: <span class="mathjax">Wiesner, M</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&amp;query=Wiesner%2C+M">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Wiesner, M"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Wiesner%2C+M&amp;terms-0-field=author&amp;size=50&amp;order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Wiesner, M"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.05674">arXiv:2502.05674</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.05674">pdf</a>, <a href="https://arxiv.org/format/2502.05674">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Less is More for Synthetic Speech Detection in the Wild </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Garg%2C+A">Ashi Garg</a>, <a href="/search/cs?searchtype=author&amp;query=Cai%2C+Z">Zexin Cai</a>, <a href="/search/cs?searchtype=author&amp;query=Xinyuan%2C+H+L">Henry Li Xinyuan</a>, <a href="/search/cs?searchtype=author&amp;query=Garc%C3%ADa-Perera%2C+L+P">Leibny Paola Garc铆a-Perera</a>, <a href="/search/cs?searchtype=author&amp;query=Duh%2C+K">Kevin Duh</a>, <a href="/search/cs?searchtype=author&amp;query=Khudanpur%2C+S">Sanjeev Khudanpur</a>, <a href="/search/cs?searchtype=author&amp;query=Wiesner%2C+M">Matthew Wiesner</a>, <a href="/search/cs?searchtype=author&amp;query=Andrews%2C+N">Nicholas Andrews</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.05674v3-abstract-short" style="display: inline;"> Driven by advances in self-supervised learning for speech, state-of-the-art synthetic speech detectors have achieved low error rates on popular benchmarks such as ASVspoof. However, prior benchmarks do not address the wide range of real-world variability in speech. Are reported error rates realistic in real-world conditions? To assess detector failure modes and robustness under controlled distribu&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05674v3-abstract-full').style.display = 'inline'; document.getElementById('2502.05674v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.05674v3-abstract-full" style="display: none;"> Driven by advances in self-supervised learning for speech, state-of-the-art synthetic speech detectors have achieved low error rates on popular benchmarks such as ASVspoof. However, prior benchmarks do not address the wide range of real-world variability in speech. Are reported error rates realistic in real-world conditions? 
To assess detector failure modes and robustness under controlled distribution shifts, we introduce ShiftySpeech, a benchmark with more than 3000 hours of synthetic speech from 7 domains, 6 TTS systems, 12 vocoders, and 3 languages. We found that all distribution shifts degraded model performance, and contrary to prior findings, training on more vocoders, speakers, or with data augmentation did not guarantee better generalization. In fact, we found that training on less diverse data resulted in better generalization, and that a detector fit using samples from a single carefully selected vocoder and a small number of speakers, without data augmentations, achieved state-of-the-art results on the challenging In-the-Wild benchmark. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05674v3-abstract-full').style.display = 'none'; document.getElementById('2502.05674v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 8 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.04519">arXiv:2502.04519</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.04519">pdf</a>, <a href="https://arxiv.org/format/2502.04519">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> GenVC: Self-Supervised Zero-Shot Voice Conversion </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Cai%2C+Z">Zexin Cai</a>, <a href="/search/cs?searchtype=author&amp;query=Xinyuan%2C+H+L">Henry Li Xinyuan</a>, <a href="/search/cs?searchtype=author&amp;query=Garg%2C+A">Ashi Garg</a>, <a href="/search/cs?searchtype=author&amp;query=Garc%C3%ADa-Perera%2C+L+P">Leibny Paola Garc铆a-Perera</a>, <a href="/search/cs?searchtype=author&amp;query=Duh%2C+K">Kevin Duh</a>, <a href="/search/cs?searchtype=author&amp;query=Khudanpur%2C+S">Sanjeev Khudanpur</a>, <a href="/search/cs?searchtype=author&amp;query=Wiesner%2C+M">Matthew Wiesner</a>, <a href="/search/cs?searchtype=author&amp;query=Andrews%2C+N">Nicholas Andrews</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.04519v1-abstract-short" style="display: inline;"> Zero-shot voice conversion has recently made substantial progress, but many models still depend on external supervised systems to disentangle speaker identity and linguistic content. Furthermore, current methods often use parallel conversion, where the converted speech inherits the source utterance&#39;s temporal structure, restricting speaker similarity and privacy. 
To overcome these limitations, we&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.04519v1-abstract-full').style.display = 'inline'; document.getElementById('2502.04519v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.04519v1-abstract-full" style="display: none;"> Zero-shot voice conversion has recently made substantial progress, but many models still depend on external supervised systems to disentangle speaker identity and linguistic content. Furthermore, current methods often use parallel conversion, where the converted speech inherits the source utterance&#39;s temporal structure, restricting speaker similarity and privacy. To overcome these limitations, we introduce GenVC, a generative zero-shot voice conversion model. GenVC learns to disentangle linguistic content and speaker style in a self-supervised manner, eliminating the need for external models and enabling efficient training on large, unlabeled datasets. Experimental results show that GenVC achieves state-of-the-art speaker similarity while maintaining naturalness competitive with leading approaches. Its autoregressive generation also allows the converted speech to deviate from the source utterance&#39;s temporal structure. This feature makes GenVC highly effective for voice anonymization, as it minimizes the preservation of source prosody and speaker characteristics, enhancing privacy protection. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.04519v1-abstract-full').style.display = 'none'; document.getElementById('2502.04519v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.00114">arXiv:2501.00114</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.00114">pdf</a>, <a href="https://arxiv.org/format/2501.00114">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> DiCoW: Diarization-Conditioned Whisper for Target Speaker Automatic Speech Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Polok%2C+A">Alexander Polok</a>, <a href="/search/cs?searchtype=author&amp;query=Klement%2C+D">Dominik Klement</a>, <a href="/search/cs?searchtype=author&amp;query=Kocour%2C+M">Martin Kocour</a>, <a href="/search/cs?searchtype=author&amp;query=Han%2C+J">Jiangyu Han</a>, <a href="/search/cs?searchtype=author&amp;query=Landini%2C+F">Federico Landini</a>, <a href="/search/cs?searchtype=author&amp;query=Yusuf%2C+B">Bolaji Yusuf</a>, <a href="/search/cs?searchtype=author&amp;query=Wiesner%2C+M">Matthew Wiesner</a>, <a href="/search/cs?searchtype=author&amp;query=Khudanpur%2C+S">Sanjeev Khudanpur</a>, <a href="/search/cs?searchtype=author&amp;query=%C4%8Cernock%C3%BD%2C+J">Jan 膶ernock媒</a>, <a href="/search/cs?searchtype=author&amp;query=Burget%2C+L">Luk谩拧 Burget</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.00114v1-abstract-short" style="display: inline;"> Speaker-attributed automatic speech recognition (ASR) in multi-speaker environments remains a significant challenge, particularly when systems conditioned on speaker embeddings fail to generalize to unseen speakers. In this work, we propose Diarization-Conditioned Whisper (DiCoW), a novel approach to target-speaker ASR that leverages speaker diarization outputs as conditioning information. DiCoW e&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.00114v1-abstract-full').style.display = 'inline'; document.getElementById('2501.00114v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.00114v1-abstract-full" style="display: none;"> Speaker-attributed automatic speech recognition (ASR) in multi-speaker environments remains a significant challenge, particularly when systems conditioned on speaker embeddings fail to generalize to unseen speakers. In this work, we propose Diarization-Conditioned Whisper (DiCoW), a novel approach to target-speaker ASR that leverages speaker diarization outputs as conditioning information. DiCoW extends the pre-trained Whisper model by integrating diarization labels directly, eliminating reliance on speaker embeddings and reducing the need for extensive speaker-specific training data. Our method introduces frame-level diarization-dependent transformations (FDDT) and query-key biasing (QKb) techniques to refine the model&#39;s focus on target speakers while effectively handling overlapping speech. 
4. Target Speaker ASR with Whisper
arXiv:2409.09543 (https://arxiv.org/abs/2409.09543) [eess.AS, cs.SD]
Authors: Alexander Polok, Dominik Klement, Matthew Wiesner, Sanjeev Khudanpur, Jan Černocký, Lukáš Burget
Abstract: We propose a novel approach to enable the use of large, single-speaker ASR models, such as Whisper, for target speaker ASR. The key claim of this method is that it is much easier to model relative differences among speakers by learning to condition on frame-level diarization outputs than to learn the space of all speaker embeddings. We find that adding even a single bias term per diarization output type before the first transformer block can transform single-speaker ASR models into target-speaker ASR models. Our approach also supports speaker-attributed ASR by sequentially generating transcripts for each speaker in a diarization output. This simplified method outperforms a baseline speech separation and diarization cascade by 12.9% absolute ORC-WER on the NOTSOFAR-1 dataset.
Submitted: 16 January 2025; v1 submitted 14 September 2024; originally announced September 2024.
Comments: Accepted to ICASSP 2025.
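The bias-term conditioning described in the abstract above can be pictured with a short sketch: one learned bias vector per diarization output type, mixed into each frame before the first transformer block. The number of output types, the soft-posterior mixing, and the feature width are illustrative assumptions, not the authors' exact configuration.

```python
# Hedged sketch of "one bias term per diarization output type" conditioning.
import torch
import torch.nn as nn

class DiarizationBias(nn.Module):
    def __init__(self, dim: int, num_types: int = 4):
        super().__init__()
        self.bias = nn.Parameter(torch.zeros(num_types, dim))

    def forward(self, feats: torch.Tensor, diar_post: torch.Tensor) -> torch.Tensor:
        # feats: (B, T, D); diar_post: (B, T, num_types) soft diarization posteriors.
        # A hard 0/1 mask works the same way and reduces to picking one bias per frame.
        return feats + diar_post @ self.bias

feats = torch.randn(1, 3000, 384)                      # assumed encoder width
diar_post = torch.softmax(torch.randn(1, 3000, 4), dim=-1)
print(DiarizationBias(384)(feats, diar_post).shape)    # torch.Size([1, 3000, 384])
```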
style="display: inline;"> We present a number of systems for the Voice Privacy Challenge, including voice conversion based systems such as the kNN-VC method and the WavLM voice Conversion method, and text-to-speech (TTS) based systems including Whisper-VITS. We found that while voice conversion systems better preserve emotional content, they struggle to conceal speaker identity in semi-white-box attack scenarios; conversel&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.08913v2-abstract-full').style.display = 'inline'; document.getElementById('2409.08913v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.08913v2-abstract-full" style="display: none;"> We present a number of systems for the Voice Privacy Challenge, including voice conversion based systems such as the kNN-VC method and the WavLM voice Conversion method, and text-to-speech (TTS) based systems including Whisper-VITS. We found that while voice conversion systems better preserve emotional content, they struggle to conceal speaker identity in semi-white-box attack scenarios; conversely, TTS methods perform better at anonymization and worse at emotion preservation. Finally, we propose a random admixture system which seeks to balance out the strengths and weaknesses of the two category of systems, achieving a strong EER of over 40% while maintaining UAR at a respectable 47%. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.08913v2-abstract-full').style.display = 'none'; document.getElementById('2409.08913v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 13 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Submission to the Voice Privacy Challenge 2024. 
6. Privacy versus Emotion Preservation Trade-offs in Emotion-Preserving Speaker Anonymization
arXiv:2409.03655 (https://arxiv.org/abs/2409.03655) [eess.AS, cs.LG]
Authors: Zexin Cai, Henry Li Xinyuan, Ashi Garg, Leibny Paola García-Perera, Kevin Duh, Sanjeev Khudanpur, Nicholas Andrews, Matthew Wiesner
Abstract: Advances in speech technology now allow unprecedented access to personally identifiable information through speech. To protect such information, the differential privacy field has explored ways to anonymize speech while preserving its utility, including linguistic and paralinguistic aspects. However, anonymizing speech while maintaining emotional state remains challenging. We explore this problem in the context of the VoicePrivacy 2024 challenge. Specifically, we developed various speaker anonymization pipelines and find that approaches either excel at anonymization or at preserving emotional state, but not both simultaneously. Achieving both would require an in-domain emotion recognizer. Additionally, we found that it is feasible to train a semi-effective speaker verification system using only emotion representations, demonstrating the challenge of separating these two modalities.
Submitted: 5 September 2024; originally announced September 2024.
Comments: Accepted by the 2024 IEEE Spoken Language Technology Workshop.

7. The CHiME-8 DASR Challenge for Generalizable and Array Agnostic Distant Automatic Speech Recognition and Diarization
arXiv:2407.16447 (https://arxiv.org/abs/2407.16447) [eess.AS, cs.SD]
Authors: Samuele Cornell, Taejin Park, Steve Huang, Christoph Boeddeker, Xuankai Chang, Matthew Maciejewski, Matthew Wiesner, Paola Garcia, Shinji Watanabe
Abstract: This paper presents the CHiME-8 DASR challenge, which carries on from the previous edition, CHiME-7 DASR (C7DASR), and the past CHiME-6 challenge. It focuses on joint multi-channel distant speech recognition (DASR) and diarization with one or more, possibly heterogeneous, devices. The main goal is to spur research towards meeting transcription approaches that can generalize across an arbitrary number of speakers, diverse settings (formal vs. informal conversations), meeting durations, a wide variety of acoustic scenarios, and different recording configurations. Novelties with respect to C7DASR include: i) the addition of NOTSOFAR-1, an additional office/corporate meeting scenario; ii) a manually corrected Mixer 6 development set; iii) a new track in which we allow the use of large language models (LLMs); and iv) a jury award mechanism to encourage participants to also explore more practical and innovative solutions. To lower the entry barrier for participants, we provide a standalone toolkit for downloading and preparing such datasets as well as performing text normalization and scoring their submissions. Furthermore, this year we also provide two baseline systems: one directly inherited from C7DASR and based on ESPnet, and another developed on NeMo and based on the NeMo team's submission to last year's C7DASR. Baseline system results suggest that the addition of the NOTSOFAR-1 scenario significantly increases the task's difficulty due to its high number of speakers and very short duration.
Submitted: 23 July 2024; originally announced July 2024.

8. Less Peaky and More Accurate CTC Forced Alignment by Label Priors
arXiv:2406.02560 (https://arxiv.org/abs/2406.02560) [eess.AS, cs.AI, cs.CL, cs.LG]
Authors: Ruizhe Huang, Xiaohui Zhang, Zhaoheng Ni, Li Sun, Moto Hira, Jeff Hwang, Vimal Manohar, Vineel Pratap, Matthew Wiesner, Shinji Watanabe, Daniel Povey, Sanjeev Khudanpur
Abstract: Connectionist temporal classification (CTC) models are known to have peaky output distributions. Such behavior is not a problem for automatic speech recognition (ASR), but it can cause inaccurate forced alignments (FA), especially at finer granularity, e.g., the phoneme level. This paper aims to alleviate the peaky behavior of CTC and improve its suitability for forced alignment generation by leveraging label priors, so that the scores of alignment paths containing fewer blanks are boosted and maximized during training. As a result, our CTC model produces less peaky posteriors and is able to more accurately predict the offsets of tokens in addition to their onsets. It outperforms the standard CTC model and a heuristics-based approach for obtaining CTC's token offset timestamps by 12-40% in phoneme and word boundary errors (PBE and WBE) measured on the Buckeye and TIMIT data. Compared with the most widely used FA toolkit, Montreal Forced Aligner (MFA), our method performs similarly on PBE/WBE on Buckeye, yet falls behind MFA on TIMIT. Nevertheless, our method has a much simpler training pipeline and better runtime efficiency. Our training recipe and pretrained model are released in TorchAudio.
Submitted: 18 July 2024; v1 submitted 22 April 2024; originally announced June 2024.
Comments: Accepted by ICASSP 2024. GitHub repo: https://github.com/huangruizhe/audio/tree/aligner_label_priors
9. On Speaker Attribution with SURT
arXiv:2401.15676 (https://arxiv.org/abs/2401.15676) [eess.AS, cs.SD]
Authors: Desh Raj, Matthew Wiesner, Matthew Maciejewski, Leibny Paola Garcia-Perera, Daniel Povey, Sanjeev Khudanpur
Abstract: The Streaming Unmixing and Recognition Transducer (SURT) has recently become a popular framework for continuous, streaming, multi-talker speech recognition (ASR). With advances in architecture, objectives, and mixture simulation methods, it was demonstrated that SURT can be an efficient streaming method for speaker-agnostic transcription of real meetings. In this work, we push this framework further by proposing methods to perform speaker-attributed transcription with SURT, for both short mixtures and long recordings. We achieve this by adding an auxiliary speaker branch to SURT, and synchronizing its label prediction with ASR token prediction through HAT-style blank factorization. In order to ensure consistency in relative speaker labels across different utterance groups in a recording, we propose "speaker prefixing": appending each chunk with high-confidence frames of speakers identified in previous chunks, to establish the relative order. We perform extensive ablation experiments on synthetic LibriSpeech mixtures to validate our design choices, and demonstrate the efficacy of our final model on the AMI corpus.
Submitted: 28 January 2024; originally announced January 2024.
Comments: 8 pages, 6 figures, 6 tables. Submitted to Odyssey 2024.
10. Speech collage: code-switched audio generation by collaging monolingual corpora
arXiv:2309.15674 (https://arxiv.org/abs/2309.15674) [cs.SD, cs.CL, cs.LG, eess.AS]
Authors: Amir Hussein, Dorsa Zeinali, Ondřej Klejch, Matthew Wiesner, Brian Yan, Shammur Chowdhury, Ahmed Ali, Shinji Watanabe, Sanjeev Khudanpur
Abstract: Designing effective automatic speech recognition (ASR) systems for Code-Switching (CS) often depends on the availability of the transcribed CS resources. To address data scarcity, this paper introduces Speech Collage, a method that synthesizes CS data from monolingual corpora by splicing audio segments. We further improve the smoothness quality of audio generation using an overlap-add approach. We investigate the impact of generated data on speech recognition in two scenarios: using in-domain CS text and a zero-shot approach with synthesized CS text. Empirical results highlight up to 34.4% and 16.2% relative reductions in Mixed-Error Rate and Word-Error Rate for in-domain and zero-shot scenarios, respectively. Lastly, we demonstrate that CS augmentation bolsters the model's code-switching inclination and reduces its monolingual bias.
Submitted: 27 September 2023; originally announced September 2023.
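The overlap-add smoothing mentioned in the Speech Collage abstract can be illustrated with a short crossfade at each splice point. A minimal sketch assuming 16 kHz audio and a 10 ms linear crossfade, not the paper's exact configuration:

```python
# Minimal overlap-add splice: join two segments with a short linear crossfade.
import numpy as np

def splice_overlap_add(a: np.ndarray, b: np.ndarray, sr: int = 16000, fade_ms: float = 10.0) -> np.ndarray:
    """Join segment a to segment b, crossfading the last/first fade_ms of each."""
    n = int(sr * fade_ms / 1000)
    fade = np.linspace(1.0, 0.0, n)
    overlap = a[-n:] * fade + b[:n] * (1.0 - fade)   # linear crossfade over the joint
    return np.concatenate([a[:-n], overlap, b[n:]])

# Toy usage: splice 1 s of one monolingual segment onto 1 s of another.
seg_a, seg_b = np.random.randn(16000), np.random.randn(16000)
print(splice_overlap_add(seg_a, seg_b).shape)  # (31840,)
```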
Different from previous challenges, we evaluate&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.13734v2-abstract-full').style.display = 'inline'; document.getElementById('2306.13734v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2306.13734v2-abstract-full" style="display: none;"> The CHiME challenges have played a significant role in the development and evaluation of robust automatic speech recognition (ASR) systems. We introduce the CHiME-7 distant ASR (DASR) task, within the 7th CHiME challenge. This task comprises joint ASR and diarization in far-field settings with multiple, and possibly heterogeneous, recording devices. Different from previous challenges, we evaluate systems on 3 diverse scenarios: CHiME-6, DiPCo, and Mixer 6. The goal is for participants to devise a single system that can generalize across different array geometries and use cases with no a-priori information. Another departure from earlier CHiME iterations is that participants are allowed to use open-source pre-trained models and datasets. In this paper, we describe the challenge design, motivation, and fundamental research questions in detail. We also present the baseline system, which is fully array-topology agnostic and features multi-channel diarization, channel selection, guided source separation and a robust ASR model that leverages self-supervised speech representations (SSLR). <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.13734v2-abstract-full').style.display = 'none'; document.getElementById('2306.13734v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 July, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 23 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2023. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2306.11252">arXiv:2306.11252</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2306.11252">pdf</a>, <a href="https://arxiv.org/format/2306.11252">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> HK-LegiCoST: Leveraging Non-Verbatim Transcripts for Speech Translation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xiao%2C+C">Cihan Xiao</a>, <a href="/search/cs?searchtype=author&amp;query=Xinyuan%2C+H+L">Henry Li Xinyuan</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+J">Jinyi Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+D">Dongji Gao</a>, <a href="/search/cs?searchtype=author&amp;query=Wiesner%2C+M">Matthew Wiesner</a>, <a href="/search/cs?searchtype=author&amp;query=Duh%2C+K">Kevin Duh</a>, <a href="/search/cs?searchtype=author&amp;query=Khudanpur%2C+S">Sanjeev Khudanpur</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2306.11252v1-abstract-short" style="display: inline;"> We introduce HK-LegiCoST, a new three-way parallel corpus of Cantonese-English translations, containing 600+ hours of Cantonese audio, its standard traditional Chinese transcript, and English translation, segmented and aligned at the sentence level. We describe the notable challenges in corpus preparation: segmentation, alignment of long audio recordings, and sentence-level alignment with non-verb&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.11252v1-abstract-full').style.display = 'inline'; document.getElementById('2306.11252v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2306.11252v1-abstract-full" style="display: none;"> We introduce HK-LegiCoST, a new three-way parallel corpus of Cantonese-English translations, containing 600+ hours of Cantonese audio, its standard traditional Chinese transcript, and English translation, segmented and aligned at the sentence level. We describe the notable challenges in corpus preparation: segmentation, alignment of long audio recordings, and sentence-level alignment with non-verbatim transcripts. Such transcripts make the corpus suitable for speech translation research when there are significant differences between the spoken and written forms of the source language. Due to its large size, we are able to demonstrate competitive speech translation baselines on HK-LegiCoST and extend them to promising cross-corpus results on the FLEURS Cantonese subset. These results deliver insights into speech recognition and translation research in languages for which non-verbatim or ``noisy&#39;&#39; transcription is common due to various factors, including vernacular and dialectal speech. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.11252v1-abstract-full').style.display = 'none'; document.getElementById('2306.11252v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2306.01031">arXiv:2306.01031</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2306.01031">pdf</a>, <a href="https://arxiv.org/format/2306.01031">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Bypass Temporal Classification: Weakly Supervised Automatic Speech Recognition with Imperfect Transcripts </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Gao%2C+D">Dongji Gao</a>, <a href="/search/cs?searchtype=author&amp;query=Wiesner%2C+M">Matthew Wiesner</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+H">Hainan Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Garcia%2C+L+P">Leibny Paola Garcia</a>, <a href="/search/cs?searchtype=author&amp;query=Povey%2C+D">Daniel Povey</a>, <a href="/search/cs?searchtype=author&amp;query=Khudanpur%2C+S">Sanjeev Khudanpur</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2306.01031v1-abstract-short" style="display: inline;"> This paper presents a novel algorithm for building an automatic speech recognition (ASR) model with imperfect training data. Imperfectly transcribed speech is a prevalent issue in human-annotated speech corpora, which degrades the performance of ASR models. To address this problem, we propose Bypass Temporal Classification (BTC) as an expansion of the Connectionist Temporal Classification (CTC) cr&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.01031v1-abstract-full').style.display = 'inline'; document.getElementById('2306.01031v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2306.01031v1-abstract-full" style="display: none;"> This paper presents a novel algorithm for building an automatic speech recognition (ASR) model with imperfect training data. Imperfectly transcribed speech is a prevalent issue in human-annotated speech corpora, which degrades the performance of ASR models. To address this problem, we propose Bypass Temporal Classification (BTC) as an expansion of the Connectionist Temporal Classification (CTC) criterion. BTC explicitly encodes the uncertainties associated with transcripts during training. This is accomplished by enhancing the flexibility of the training graph, which is implemented as a weighted finite-state transducer (WFST) composition. 
The proposed algorithm improves the robustness and accuracy of ASR systems, particularly when working with imprecisely transcribed speech corpora. Our implementation will be open-sourced. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.01031v1-abstract-full').style.display = 'none'; document.getElementById('2306.01031v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2211.01458">arXiv:2211.01458</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2211.01458">pdf</a>, <a href="https://arxiv.org/format/2211.01458">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Towards Zero-Shot Code-Switched Speech Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yan%2C+B">Brian Yan</a>, <a href="/search/cs?searchtype=author&amp;query=Wiesner%2C+M">Matthew Wiesner</a>, <a href="/search/cs?searchtype=author&amp;query=Klejch%2C+O">Ondrej Klejch</a>, <a href="/search/cs?searchtype=author&amp;query=Jyothi%2C+P">Preethi Jyothi</a>, <a href="/search/cs?searchtype=author&amp;query=Watanabe%2C+S">Shinji Watanabe</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2211.01458v2-abstract-short" style="display: inline;"> In this work, we seek to build effective code-switched (CS) automatic speech recognition systems (ASR) under the zero-shot setting where no transcribed CS speech data is available for training. Previously proposed frameworks which conditionally factorize the bilingual task into its constituent monolingual parts are a promising starting point for leveraging monolingual data efficiently. However, th&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.01458v2-abstract-full').style.display = 'inline'; document.getElementById('2211.01458v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2211.01458v2-abstract-full" style="display: none;"> In this work, we seek to build effective code-switched (CS) automatic speech recognition systems (ASR) under the zero-shot setting where no transcribed CS speech data is available for training. Previously proposed frameworks which conditionally factorize the bilingual task into its constituent monolingual parts are a promising starting point for leveraging monolingual data efficiently. However, these methods require the monolingual modules to perform language segmentation. That is, each monolingual module has to simultaneously detect CS points and transcribe speech segments of one language while ignoring those of other languages -- not a trivial task. 
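
Since BTC is described as an expansion of the CTC criterion, a minimal PyTorch sketch of the standard CTC loss it builds on may help situate the idea. The tensors below are random stand-ins, and the bypass arcs that BTC adds to the WFST training graph are only described in comments, not implemented here.

# Minimal sketch of the standard CTC criterion that BTC expands.
# Illustration only, not the authors' implementation: BTC would additionally add
# "bypass" arcs to the WFST training graph so that unreliable transcript tokens
# can be skipped or substituted at a penalty during training.
import torch

torch.manual_seed(0)

B, T, V = 2, 50, 30                                    # batch, frames, vocab (0 = blank)
log_probs = torch.randn(T, B, V).log_softmax(dim=-1)   # (T, B, V), as CTCLoss expects
targets = torch.randint(1, V, (B, 12))                 # hypothetical (possibly noisy) transcripts
input_lengths = torch.full((B,), T, dtype=torch.long)
target_lengths = torch.full((B,), 12, dtype=torch.long)

ctc = torch.nn.CTCLoss(blank=0, zero_infinity=True)
print(ctc(log_probs, targets, input_lengths, target_lengths).item())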

arXiv:2211.01458 (https://arxiv.org/abs/2211.01458) [pdf, other]
Subjects: cs.CL (Computation and Language); cs.SD (Sound); eess.AS (Audio and Speech Processing)
Towards Zero-Shot Code-Switched Speech Recognition
Authors: Brian Yan, Matthew Wiesner, Ondrej Klejch, Preethi Jyothi, Shinji Watanabe
Abstract: In this work, we seek to build effective code-switched (CS) automatic speech recognition systems (ASR) under the zero-shot setting where no transcribed CS speech data is available for training. Previously proposed frameworks which conditionally factorize the bilingual task into its constituent monolingual parts are a promising starting point for leveraging monolingual data efficiently. However, these methods require the monolingual modules to perform language segmentation. That is, each monolingual module has to simultaneously detect CS points and transcribe speech segments of one language while ignoring those of other languages -- not a trivial task. We propose to simplify each monolingual module by allowing them to transcribe all speech segments indiscriminately with a monolingual script (i.e. transliteration). This simple modification passes the responsibility of CS point detection to subsequent bilingual modules which determine the final output by considering multiple monolingual transliterations along with external language model information. We apply this transliteration-based approach in an end-to-end differentiable neural network and demonstrate its efficacy for zero-shot CS ASR on Mandarin-English SEAME test sets.
Submitted 9 November, 2022; v1 submitted 2 November, 2022; originally announced November 2022.
Comments: 5 pages

arXiv:2110.04863 (https://arxiv.org/abs/2110.04863) [pdf, other]
Subjects: eess.AS (Audio and Speech Processing); cs.CL (Computation and Language)
Injecting Text and Cross-lingual Supervision in Few-shot Learning from Self-Supervised Models
Authors: Matthew Wiesner, Desh Raj, Sanjeev Khudanpur
Abstract: Self-supervised model pre-training has recently garnered significant interest, but relatively few efforts have explored using additional resources in fine-tuning these models.
We demonstrate how universal phoneset acoustic models can leverage cross-lingual supervision to improve transfer of pretrained self-supervised representations to new languages. We also show how target-language text can be used to enable and improve fine-tuning with the lattice-free maximum mutual information (LF-MMI) objective. In three low-resource languages these techniques greatly improved few-shot learning performance.
Submitted 10 October, 2021; originally announced October 2021.
Comments: © 2021 IEEE. Personal use of this material is permitted. Permission from IEEE must be obtained for all other uses, in any current or future media, including reprinting/republishing this material for advertising or promotional purposes, creating new collective works, for resale or redistribution to servers or lists, or reuse of any copyrighted component of this work in other works.

arXiv:2102.01757 (https://arxiv.org/abs/2102.01757) [pdf, other]
Subjects: cs.CL (Computation and Language)
The Multilingual TEDx Corpus for Speech Recognition and Translation
Authors: Elizabeth Salesky, Matthew Wiesner, Jacob Bremerman, Roldano Cattoni, Matteo Negri, Marco Turchi, Douglas W. Oard, Matt Post
Abstract: We present the Multilingual TEDx corpus, built to support speech recognition (ASR) and speech translation (ST) research across many non-English source languages. The corpus is a collection of audio recordings from TEDx talks in 8 source languages. We segment transcripts into sentences and align them to the source-language audio and target-language translations.
The corpus is released along with open-sourced code enabling extension to new talks and languages as they become available. Our corpus creation methodology can be applied to more languages than previous work, and creates multi-way parallel evaluation sets. We provide baselines in multiple ASR and ST settings, including multilingual models to improve translation performance for low-resource language pairs.
Submitted 14 June, 2021; v1 submitted 2 February, 2021; originally announced February 2021.
Comments: Accepted to Interspeech 2021

arXiv:2005.13962 (https://arxiv.org/abs/2005.13962) [pdf, other]
Subjects: cs.CL (Computation and Language)
A Corpus for Large-Scale Phonetic Typology
Authors: Elizabeth Salesky, Eleanor Chodroff, Tiago Pimentel, Matthew Wiesner, Ryan Cotterell, Alan W Black, Jason Eisner
Abstract: A major hurdle in data-driven research on typology is having sufficient data in many languages to draw meaningful conclusions.
We present VoxClamantis v1.0, the first large-scale corpus for phonetic typology, with aligned segments and estimated phoneme-level labels in 690 readings spanning 635 languages, along with acoustic-phonetic measures of vowels and sibilants. Access to such data can greatly facilitate investigation of phonetic typology at a large scale and across many languages. However, it is non-trivial and computationally intensive to obtain such alignments for hundreds of languages, many of which have few to no resources presently available. We describe the methodology to create our corpus, discuss caveats with current methods and their impact on the utility of this data, and illustrate possible research directions through a series of case studies on the 48 highest-quality readings. Our corpus and scripts are publicly available for non-commercial use at https://voxclamantisproject.github.io.
Submitted 28 May, 2020; originally announced May 2020.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to ACL2020</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1910.12299">arXiv:1910.12299</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1910.12299">pdf</a>, <a href="https://arxiv.org/format/1910.12299">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Induced Inflection-Set Keyword Search in Speech </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Adams%2C+O">Oliver Adams</a>, <a href="/search/cs?searchtype=author&amp;query=Wiesner%2C+M">Matthew Wiesner</a>, <a href="/search/cs?searchtype=author&amp;query=Trmal%2C+J">Jan Trmal</a>, <a href="/search/cs?searchtype=author&amp;query=Nicolai%2C+G">Garrett Nicolai</a>, <a href="/search/cs?searchtype=author&amp;query=Yarowsky%2C+D">David Yarowsky</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1910.12299v2-abstract-short" style="display: inline;"> We investigate the problem of searching for a lexeme-set in speech by searching for its inflectional variants. Experimental results indicate how lexeme-set search performance changes with the number of hypothesized inflections, while ablation experiments highlight the relative importance of different components in the lexeme-set search pipeline and the value of using curated inflectional paradigms&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1910.12299v2-abstract-full').style.display = 'inline'; document.getElementById('1910.12299v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1910.12299v2-abstract-full" style="display: none;"> We investigate the problem of searching for a lexeme-set in speech by searching for its inflectional variants. Experimental results indicate how lexeme-set search performance changes with the number of hypothesized inflections, while ablation experiments highlight the relative importance of different components in the lexeme-set search pipeline and the value of using curated inflectional paradigms. We provide a recipe and evaluation set for the community to use as an extrinsic measure of the performance of inflection generation approaches. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1910.12299v2-abstract-full').style.display = 'none'; document.getElementById('1910.12299v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 May, 2020; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 27 October, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2019. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">To appear in SIGMORPHON 2020</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1904.02210">arXiv:1904.02210</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1904.02210">pdf</a>, <a href="https://arxiv.org/format/1904.02210">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Massively Multilingual Adversarial Speech Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Adams%2C+O">Oliver Adams</a>, <a href="/search/cs?searchtype=author&amp;query=Wiesner%2C+M">Matthew Wiesner</a>, <a href="/search/cs?searchtype=author&amp;query=Watanabe%2C+S">Shinji Watanabe</a>, <a href="/search/cs?searchtype=author&amp;query=Yarowsky%2C+D">David Yarowsky</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1904.02210v1-abstract-short" style="display: inline;"> We report on adaptation of multilingual end-to-end speech recognition models trained on as many as 100 languages. Our findings shed light on the relative importance of similarity between the target and pretraining languages along the dimensions of phonetics, phonology, language family, geographical location, and orthography. In this context, experiments demonstrate the effectiveness of two additio&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1904.02210v1-abstract-full').style.display = 'inline'; document.getElementById('1904.02210v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1904.02210v1-abstract-full" style="display: none;"> We report on adaptation of multilingual end-to-end speech recognition models trained on as many as 100 languages. Our findings shed light on the relative importance of similarity between the target and pretraining languages along the dimensions of phonetics, phonology, language family, geographical location, and orthography. In this context, experiments demonstrate the effectiveness of two additional pretraining objectives in encouraging language-independent encoder representations: a context-independent phoneme objective paired with a language-adversarial classification objective. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1904.02210v1-abstract-full').style.display = 'none'; document.getElementById('1904.02210v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 April, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2019. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at NAACL-HLT 2019</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1812.03919">arXiv:1812.03919</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1812.03919">pdf</a>, <a href="https://arxiv.org/format/1812.03919">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Pretraining by Backtranslation for End-to-end ASR in Low-Resource Settings </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wiesner%2C+M">Matthew Wiesner</a>, <a href="/search/cs?searchtype=author&amp;query=Renduchintala%2C+A">Adithya Renduchintala</a>, <a href="/search/cs?searchtype=author&amp;query=Watanabe%2C+S">Shinji Watanabe</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+C">Chunxi Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Dehak%2C+N">Najim Dehak</a>, <a href="/search/cs?searchtype=author&amp;query=Khudanpur%2C+S">Sanjeev Khudanpur</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1812.03919v2-abstract-short" style="display: inline;"> We explore training attention-based encoder-decoder ASR in low-resource settings. These models perform poorly when trained on small amounts of transcribed speech, in part because they depend on having sufficient target-side text to train the attention and decoder networks. In this paper we address this shortcoming by pretraining our network parameters using only text-based data and transcribed spe&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1812.03919v2-abstract-full').style.display = 'inline'; document.getElementById('1812.03919v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1812.03919v2-abstract-full" style="display: none;"> We explore training attention-based encoder-decoder ASR in low-resource settings. These models perform poorly when trained on small amounts of transcribed speech, in part because they depend on having sufficient target-side text to train the attention and decoder networks. In this paper we address this shortcoming by pretraining our network parameters using only text-based data and transcribed speech from other languages. We analyze the relative contributions of both sources of data. Across 3 test languages, our text-based approach resulted in a 20% average relative improvement over a text-based augmentation technique without pretraining. Using transcribed speech from nearby languages gives a further 20-30% relative reduction in character error rate. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1812.03919v2-abstract-full').style.display = 'none'; document.getElementById('1812.03919v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 August, 2019; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 10 December, 2018; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2018. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1811.03451">arXiv:1811.03451</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1811.03451">pdf</a>, <a href="https://arxiv.org/format/1811.03451">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Analysis of Multilingual Sequence-to-Sequence speech recognition systems </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Karafi%C3%A1t%2C+M">Martin Karafi谩t</a>, <a href="/search/cs?searchtype=author&amp;query=Baskar%2C+M+K">Murali Karthick Baskar</a>, <a href="/search/cs?searchtype=author&amp;query=Watanabe%2C+S">Shinji Watanabe</a>, <a href="/search/cs?searchtype=author&amp;query=Hori%2C+T">Takaaki Hori</a>, <a href="/search/cs?searchtype=author&amp;query=Wiesner%2C+M">Matthew Wiesner</a>, <a href="/search/cs?searchtype=author&amp;query=%C4%8Cernock%C3%BD%2C+J+%22">Jan &#34;Honza&#39;&#39; 膶ernock媒</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1811.03451v1-abstract-short" style="display: inline;"> This paper investigates the applications of various multilingual approaches developed in conventional hidden Markov model (HMM) systems to sequence-to-sequence (seq2seq) automatic speech recognition (ASR). On a set composed of Babel data, we first show the effectiveness of multi-lingual training with stacked bottle-neck (SBN) features. Then we explore various architectures and training strategies&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1811.03451v1-abstract-full').style.display = 'inline'; document.getElementById('1811.03451v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1811.03451v1-abstract-full" style="display: none;"> This paper investigates the applications of various multilingual approaches developed in conventional hidden Markov model (HMM) systems to sequence-to-sequence (seq2seq) automatic speech recognition (ASR). On a set composed of Babel data, we first show the effectiveness of multi-lingual training with stacked bottle-neck (SBN) features. Then we explore various architectures and training strategies of multi-lingual seq2seq models based on CTC-attention networks including combinations of output layer, CTC and/or attention component re-training. 
We also investigate the effectiveness of language-transfer learning in a very low resource scenario when the target language is not included in the original multi-lingual training data. Interestingly, we found multilingual features superior to multilingual models, and this finding suggests that we can efficiently combine the benefits of the HMM system with the seq2seq system through these multilingual feature techniques.
Submitted 7 November, 2018; originally announced November 2018.
Comments: arXiv admin note: text overlap with arXiv:1810.03459

arXiv:1810.03459 (https://arxiv.org/abs/1810.03459) [pdf, other]
Subjects: cs.CL (Computation and Language); cs.LG (Machine Learning); cs.SD (Sound); eess.AS (Audio and Speech Processing)
Multilingual sequence-to-sequence speech recognition: architecture, transfer learning, and language modeling
Authors: Jaejin Cho, Murali Karthick Baskar, Ruizhi Li, Matthew Wiesner, Sri Harish Mallidi, Nelson Yalta, Martin Karafiat, Shinji Watanabe, Takaaki Hori
Abstract: Sequence-to-sequence (seq2seq) approach for low-resource ASR is a relatively new direction in speech research. The approach benefits by performing model training without using lexicon and alignments. However, this poses a new problem of requiring more data compared to conventional DNN-HMM systems.
In this work, we attempt to use data from 10 BABEL languages to build a multi-lingual seq2seq model as a prior model, and then port them towards 4 other BABEL languages using transfer learning approach. We also explore different architectures for improving the prior multilingual seq2seq model. The paper also discusses the effect of integrating a recurrent neural network language model (RNNLM) with a seq2seq model during decoding. Experimental results show that the transfer learning approach from the multilingual model shows substantial gains over monolingual models across all 4 BABEL languages. Incorporating an RNNLM also brings significant improvements in terms of %WER, and achieves recognition performance comparable to the models trained with twice more training data.
Submitted 4 October, 2018; originally announced October 2018.
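
One standard way to integrate an RNNLM with a seq2seq model during decoding is shallow fusion: the language-model log-probability is interpolated into the decoder's token scores at each beam-search step. The sketch below uses random tensors in place of real model outputs and an assumed weight of 0.3; it illustrates the scoring rule only, not the authors' exact decoder.

# Shallow-fusion scoring at one decoding step (illustrative stand-in tensors).
import torch

def fused_scores(s2s_logits, lm_logits, lm_weight=0.3):
    """log p_s2s(y | x, y_<t) + lm_weight * log p_lm(y | y_<t), per vocabulary entry."""
    return s2s_logits.log_softmax(-1) + lm_weight * lm_logits.log_softmax(-1)

vocab = 500
s2s_logits = torch.randn(vocab)   # hypothetical seq2seq decoder output at this step
lm_logits = torch.randn(vocab)    # hypothetical RNNLM output at the same step
print(fused_scores(s2s_logits, lm_logits).argmax().item())  # greedily chosen next token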

arXiv:1807.06204 (https://arxiv.org/abs/1807.06204) [pdf, other]
Subjects: cs.CL (Computation and Language)
Low-Resource Contextual Topic Identification on Speech
Authors: Chunxi Liu, Matthew Wiesner, Shinji Watanabe, Craig Harman, Jan Trmal, Najim Dehak, Sanjeev Khudanpur
Abstract: In topic identification (topic ID) on real-world unstructured audio, an audio instance of variable topic shifts is first broken into sequential segments, and each segment is independently classified. We first present a general purpose method for topic ID on spoken segments in low-resource languages, using a cascade of universal acoustic modeling, translation lexicons to English, and English-language topic classification. Next, instead of classifying each segment independently, we demonstrate that exploring the contextual dependencies across sequential segments can provide large improvements. In particular, we propose an attention-based contextual model which is able to leverage the contexts in a selective manner. We test both our contextual and non-contextual models on four LORELEI languages, and on all but one our attention-based contextual model significantly outperforms the context-independent models.
Submitted 28 September, 2018; v1 submitted 17 July, 2018; originally announced July 2018.
Comments: Accepted for publication at 2018 IEEE Workshop on Spoken Language Technology (SLT)
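
A rough sketch of the contextual idea, attending over a document's neighboring segments before classifying each segment's topic, is given below. It uses an off-the-shelf multi-head attention layer and invented dimensions, so it should be read as an illustration rather than the paper's architecture.

# Toy contextual topic classifier over a sequence of segment embeddings.
import torch
import torch.nn as nn

class ContextualTopicID(nn.Module):
    def __init__(self, seg_dim=128, n_topics=12):
        super().__init__()
        self.attn = nn.MultiheadAttention(embed_dim=seg_dim, num_heads=4, batch_first=True)
        self.clf = nn.Linear(seg_dim, n_topics)

    def forward(self, segments):
        # segments: (B, S, seg_dim) -- one document's sequence of segment embeddings.
        ctx, _ = self.attn(segments, segments, segments)  # each segment attends to its context
        return self.clf(ctx)                              # (B, S, n_topics): one prediction per segment

print(ContextualTopicID()(torch.randn(2, 7, 128)).shape)  # torch.Size([2, 7, 12])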
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1807.06204v2-abstract-full').style.display = 'none'; document.getElementById('1807.06204v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 September, 2018; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 17 July, 2018; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2018. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted for publication at 2018 IEEE Workshop on Spoken Language Technology (SLT)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1804.00015">arXiv:1804.00015</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1804.00015">pdf</a>, <a href="https://arxiv.org/format/1804.00015">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> ESPnet: End-to-End Speech Processing Toolkit </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Watanabe%2C+S">Shinji Watanabe</a>, <a href="/search/cs?searchtype=author&amp;query=Hori%2C+T">Takaaki Hori</a>, <a href="/search/cs?searchtype=author&amp;query=Karita%2C+S">Shigeki Karita</a>, <a href="/search/cs?searchtype=author&amp;query=Hayashi%2C+T">Tomoki Hayashi</a>, <a href="/search/cs?searchtype=author&amp;query=Nishitoba%2C+J">Jiro Nishitoba</a>, <a href="/search/cs?searchtype=author&amp;query=Unno%2C+Y">Yuya Unno</a>, <a href="/search/cs?searchtype=author&amp;query=Soplin%2C+N+E+Y">Nelson Enrique Yalta Soplin</a>, <a href="/search/cs?searchtype=author&amp;query=Heymann%2C+J">Jahn Heymann</a>, <a href="/search/cs?searchtype=author&amp;query=Wiesner%2C+M">Matthew Wiesner</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+N">Nanxin Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Renduchintala%2C+A">Adithya Renduchintala</a>, <a href="/search/cs?searchtype=author&amp;query=Ochiai%2C+T">Tsubasa Ochiai</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1804.00015v1-abstract-short" style="display: inline;"> This paper introduces a new open source platform for end-to-end speech processing named ESPnet. ESPnet mainly focuses on end-to-end automatic speech recognition (ASR), and adopts widely-used dynamic neural network toolkits, Chainer and PyTorch, as a main deep learning engine. ESPnet also follows the Kaldi ASR toolkit style for data processing, feature extraction/format, and recipes to provide a co&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1804.00015v1-abstract-full').style.display = 'inline'; document.getElementById('1804.00015v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1804.00015v1-abstract-full" style="display: none;"> This paper introduces a new open source platform for end-to-end speech processing named ESPnet. 
ESPnet mainly focuses on end-to-end automatic speech recognition (ASR), and adopts widely-used dynamic neural network toolkits, Chainer and PyTorch, as a main deep learning engine. ESPnet also follows the Kaldi ASR toolkit style for data processing, feature extraction/format, and recipes to provide a complete setup for speech recognition and other speech processing experiments. This paper explains a major architecture of this software platform, several important functionalities, which differentiate ESPnet from other open source ASR toolkits, and experimental results with major ASR benchmarks.
Submitted 30 March, 2018; originally announced April 2018.

arXiv:1803.10299 (https://arxiv.org/abs/1803.10299) [pdf, other]
Subjects: cs.CL (Computation and Language); cs.SD (Sound); eess.AS (Audio and Speech Processing)
Multi-Modal Data Augmentation for End-to-End ASR
Authors: Adithya Renduchintala, Shuoyang Ding, Matthew Wiesner, Shinji Watanabe
Abstract: We present a new end-to-end architecture for automatic speech recognition (ASR) that can be trained using symbolic input in addition to the traditional acoustic input. This architecture utilizes two separate encoders: one for acoustic input and another for symbolic input, both sharing the attention and decoder parameters.
We call this architecture a multi-modal data augmentation network (MMDA), as it can support multi-modal (acoustic and symbolic) input and enables seamless mixing of large text datasets with significantly smaller transcribed speech corpora during training. We study different ways of transforming large text corpora into a symbolic form suitable for training our MMDA network. Our best MMDA setup obtains small improvements on character error rate (CER), and as much as 7-10% relative word error rate (WER) improvement over a baseline both with and without an external language model.
Submitted 18 June, 2018; v1 submitted 27 March, 2018; originally announced March 2018.
Comments: 5 Pages, 1 Figure, accepted at INTERSPEECH 2018
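
The two-encoder, shared-decoder layout can be sketched as follows. The modules, dimensions, and the placement of the shared attention are heavily simplified assumptions (not the authors' ESPnet implementation); the point of the sketch is only that the acoustic and symbolic paths reuse the same attention and decoder parameters.

# Simplified dual-encoder model in which both modalities share attention + decoder.
import torch
import torch.nn as nn

class SharedDecoderMMDA(nn.Module):
    def __init__(self, feat_dim=80, sym_vocab=500, out_vocab=500, d=256):
        super().__init__()
        self.acoustic_enc = nn.GRU(feat_dim, d, batch_first=True)            # speech features
        self.sym_emb = nn.Embedding(sym_vocab, d)                            # symbolic input
        self.symbolic_enc = nn.GRU(d, d, batch_first=True)
        self.attn = nn.MultiheadAttention(d, num_heads=4, batch_first=True)  # shared
        self.decoder = nn.GRU(d, d, batch_first=True)                        # shared
        self.out = nn.Linear(d, out_vocab)

    def forward(self, x, modality, dec_emb):
        # Encode with a modality-specific encoder, then decode with the shared modules.
        if modality == "acoustic":                 # x: (B, T, feat_dim)
            enc, _ = self.acoustic_enc(x)
        else:                                      # x: (B, T) symbol ids
            enc, _ = self.symbolic_enc(self.sym_emb(x))
        dec, _ = self.decoder(dec_emb)             # dec_emb: (B, U, d) previous-token embeddings
        ctx, _ = self.attn(dec, enc, enc)          # shared attention over encoder states
        return self.out(ctx)                       # (B, U, out_vocab)

model = SharedDecoderMMDA()
speech_logits = model(torch.randn(2, 120, 80), "acoustic", torch.randn(2, 10, 256))
text_logits = model(torch.randint(0, 500, (2, 40)), "symbolic", torch.randn(2, 10, 256))
print(speech_logits.shape, text_logits.shape)  # both torch.Size([2, 10, 500])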

arXiv:1802.08731 (https://arxiv.org/abs/1802.08731) [pdf, other]
Subjects: cs.CL (Computation and Language)
Automatic Speech Recognition and Topic Identification for Almost-Zero-Resource Languages
Authors: Matthew Wiesner, Chunxi Liu, Lucas Ondel, Craig Harman, Vimal Manohar, Jan Trmal, Zhongqiang Huang, Najim Dehak, Sanjeev Khudanpur
Abstract: Automatic speech recognition (ASR) systems often need to be developed for extremely low-resource languages to serve end-uses such as audio content categorization and search. While universal phone recognition is natural to consider when no transcribed speech is available to train an ASR system in a language, adapting universal phone models using very small amounts (minutes rather than hours) of transcribed speech also needs to be studied, particularly with state-of-the-art DNN-based acoustic models. The DARPA LORELEI program provides a framework for such very-low-resource ASR studies, and provides an extrinsic metric for evaluating ASR performance in a humanitarian assistance, disaster relief setting. This paper presents our Kaldi-based systems for the program, which employ a universal phone modeling approach to ASR, and describes recipes for very rapid adaptation of this universal ASR system. The results we obtain significantly outperform results obtained by many competing approaches on the NIST LoReHLT 2017 Evaluation datasets.
Submitted 18 June, 2018; v1 submitted 23 February, 2018; originally announced February 2018.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted for publication at Interspeech 2018</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1703.07476">arXiv:1703.07476</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1703.07476">pdf</a>, <a href="https://arxiv.org/format/1703.07476">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Topic Identification for Speech without ASR </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liu%2C+C">Chunxi Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Trmal%2C+J">Jan Trmal</a>, <a href="/search/cs?searchtype=author&amp;query=Wiesner%2C+M">Matthew Wiesner</a>, <a href="/search/cs?searchtype=author&amp;query=Harman%2C+C">Craig Harman</a>, <a href="/search/cs?searchtype=author&amp;query=Khudanpur%2C+S">Sanjeev Khudanpur</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1703.07476v2-abstract-short" style="display: inline;"> Modern topic identification (topic ID) systems for speech use automatic speech recognition (ASR) to produce speech transcripts, and perform supervised classification on such ASR outputs. However, under resource-limited conditions, the manually transcribed speech required to develop standard ASR systems can be severely limited or unavailable. In this paper, we investigate alternative unsupervised s&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1703.07476v2-abstract-full').style.display = 'inline'; document.getElementById('1703.07476v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1703.07476v2-abstract-full" style="display: none;"> Modern topic identification (topic ID) systems for speech use automatic speech recognition (ASR) to produce speech transcripts, and perform supervised classification on such ASR outputs. However, under resource-limited conditions, the manually transcribed speech required to develop standard ASR systems can be severely limited or unavailable. In this paper, we investigate alternative unsupervised solutions to obtaining tokenizations of speech in terms of a vocabulary of automatically discovered word-like or phoneme-like units, without depending on the supervised training of ASR systems. Moreover, using automatic phoneme-like tokenizations, we demonstrate that a convolutional neural network based framework for learning spoken document representations provides competitive performance compared to a standard bag-of-words representation, as evidenced by comprehensive topic ID evaluations on both single-label and multi-label classification tasks. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1703.07476v2-abstract-full').style.display = 'none'; document.getElementById('1703.07476v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 July, 2017; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 21 March, 2017; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2017. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages, 2 figures; accepted for publication at Interspeech 2017</span> </p> </li> </ol> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 
