Search | arXiv e-print repository

Showing 1&ndash;19 of 19 results for author: Soltau, H

Searching in archive cs (search in all archives: /search/?searchtype=author&query=Soltau%2C+H). Results are sorted by announcement date (newest first), 50 per page.
1. arXiv:2412.15396 [cs.CV, cs.AI, cs.CL, cs.IR]
   https://arxiv.org/abs/2412.15396
   Title: Learning Visual Composition through Improved Semantic Guidance
   Authors: Austin Stone, Hagen Soltau, Robert Geirhos, Xi Yi, Ye Xia, Bingyi Cao, Kaifeng Chen, Abhijit Ogale, Jonathon Shlens
   Abstract: Visual imagery does not consist of solitary objects, but instead reflects the composition of a multitude of fluid concepts. While there have been great advances in visual representation learning, such advances have focused on building better representations for a small number of discrete objects, bereft of an understanding of how these objects interact. One can observe this limitation in representations learned through captions or contrastive learning, where the learned model treats an image essentially as a bag of words. Several works have attempted to address this limitation through bespoke learned architectures that directly target the shortcomings of compositional learning. In this work, we focus on simple and scalable approaches. In particular, we demonstrate that by substantially improving weakly labeled data, i.e. captions, we can vastly improve the performance of standard contrastive learning approaches. Previous CLIP models achieved near-chance rate on challenging tasks probing compositional learning. However, our simple approach boosts the performance of CLIP substantially and surpasses all bespoke architectures. Furthermore, we showcase our results on a relatively new captioning benchmark derived from DOCCI. We demonstrate through a series of ablations that a standard CLIP model trained with enhanced data may demonstrate impressive performance on image retrieval tasks.
   Submitted: 19 December 2024; originally announced December 2024.
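The contrastive approach this entry builds on is the standard CLIP objective: a symmetric cross-entropy over the batch's image-text similarity matrix. Below is a minimal NumPy sketch of that baseline loss, with random arrays standing in for encoder outputs; it is an illustration of the standard objective, not the paper's code.

```python
# Minimal sketch of the standard CLIP-style symmetric contrastive loss.
# The random embeddings stand in for real image/text encoder outputs.
import numpy as np

def log_softmax(x, axis):
    x = x - x.max(axis=axis, keepdims=True)
    return x - np.log(np.exp(x).sum(axis=axis, keepdims=True))

def clip_loss(img_emb, txt_emb, temperature=0.07):
    img = img_emb / np.linalg.norm(img_emb, axis=1, keepdims=True)
    txt = txt_emb / np.linalg.norm(txt_emb, axis=1, keepdims=True)
    logits = img @ txt.T / temperature          # (B, B) similarity matrix
    diag = np.arange(len(logits))               # matching pairs on the diagonal
    loss_i2t = -log_softmax(logits, axis=1)[diag, diag].mean()
    loss_t2i = -log_softmax(logits, axis=0)[diag, diag].mean()
    return (loss_i2t + loss_t2i) / 2

rng = np.random.default_rng(0)
print(clip_loss(rng.normal(size=(8, 64)), rng.normal(size=(8, 64))))
```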
2. arXiv:2405.13640 [cs.CL, cs.AI, cs.LG]
   https://arxiv.org/abs/2405.13640
   DOI: 10.1109/TASLPRO.2025.3540648
   Title: Knowledge Graph Reasoning with Self-supervised Reinforcement Learning
   Authors: Ying Ma, Owen Burns, Mingqiu Wang, Gang Li, Nan Du, Laurent El Shafey, Liqiang Wang, Izhak Shafran, Hagen Soltau
   Abstract: Reinforcement learning (RL) is an effective method of finding reasoning pathways in incomplete knowledge graphs (KGs). To overcome the challenges of a large action space, a self-supervised pre-training method is proposed to warm up the policy network before the RL training stage. To alleviate the distributional mismatch issue in general self-supervised RL (SSRL), in our supervised learning (SL) stage the agent selects actions based on the policy network and learns from generated labels; this self-generation of labels is the intuition behind the name self-supervised. With this training framework, the information density of our SL objective is increased and the agent is prevented from getting stuck on early rewarded paths. Our self-supervised RL (SSRL) method improves the performance of RL by pairing it with the wide coverage achieved by SL during pretraining, since the breadth of the SL objective makes it infeasible to train an agent with SL alone. We show that our SSRL model meets or exceeds current state-of-the-art results on all Hits@k and mean reciprocal rank (MRR) metrics on four large benchmark KG datasets. This SSRL method can be used as a plug-in for any RL architecture for a KG reasoning task. We adopt two RL architectures, i.e., MINERVA and MultiHopKG, as our baseline RL models and experimentally show that our SSRL model consistently outperforms both baselines on all four KG reasoning tasks. Full code for the paper is available at https://github.com/owenonline/Knowledge-Graph-Reasoning-with-Self-supervised-Reinforcement-Learning.
   Submitted: 22 May 2024; originally announced May 2024.
   Comments: 17 pages, 11 figures
   Journal ref: IEEE Transactions on Audio, Speech and Language Processing
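One way to picture the warm-up stage described above: the policy samples its own rollouts, and actions along rewarded paths are kept as supervised labels for cross-entropy updates. The toy below invents a four-state graph and a tabular softmax policy purely for illustration; the paper's agents are MINERVA- and MultiHopKG-style networks.

```python
# Toy sketch of the self-supervised warm-up: the policy samples rollouts,
# and actions from rewarded paths become supervised labels. The 4-state
# graph and tabular policy are invented stand-ins for a real KG and network.
import numpy as np

rng = np.random.default_rng(0)
n_states, n_actions, lr = 4, 3, 0.5
logits = np.zeros((n_states, n_actions))        # tabular stand-in for the policy

def softmax(x):
    e = np.exp(x - x.max())
    return e / e.sum()

def rollout(steps=3):
    path, s = [], 0
    for _ in range(steps):
        a = rng.choice(n_actions, p=softmax(logits[s]))
        path.append((s, a))
        s = (s + a) % n_states                  # toy KG transition
    return path, float(s == n_states - 1)       # reward only at the goal entity

for _ in range(200):                            # SL warm-up before RL training
    path, reward = rollout()
    if reward:                                  # rewarded paths become labels
        for s, a in path:
            p = softmax(logits[s])
            p[a] -= 1.0                         # cross-entropy gradient
            logits[s] -= lr * p
```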
3. arXiv:2402.01828 [cs.CL, cs.AI, cs.SD, eess.AS]
   https://arxiv.org/abs/2402.01828
   Title: Retrieval Augmented End-to-End Spoken Dialog Models
   Authors: Mingqiu Wang, Izhak Shafran, Hagen Soltau, Wei Han, Yuan Cao, Dian Yu, Laurent El Shafey
   Abstract: We recently developed SLM, a joint speech and language model, which fuses a pretrained foundational speech model and a large language model (LLM), while preserving the in-context learning capability intrinsic to the pretrained LLM. In this paper, we apply SLM to speech dialog applications where the dialog states are inferred directly from the audio signal. Task-oriented dialogs often contain domain-specific entities, i.e., restaurants, hotels, train stations, and city names, which are difficult to recognize yet critical for downstream applications. Inspired by the RAG (retrieval-augmented generation) paradigm, we propose a retrieval augmented SLM (ReSLM) that overcomes this weakness. We first train a speech retriever to retrieve text entities mentioned in the audio. The retrieved entities are then added as text inputs to the underlying SLM to bias model predictions. We evaluated ReSLM on the speech MultiWoz task (DSTC-11 challenge), and found that this retrieval augmentation boosts model performance on joint goal accuracy (38.6% vs 32.7%), slot error rate (20.6% vs 24.8%), and ASR word error rate (5.5% vs 6.7%). While demonstrated on dialog state tracking, our approach is broadly applicable to other speech tasks requiring contextual information or domain-specific entities, such as contextual ASR with biasing capability.
   Submitted: 2 February 2024; originally announced February 2024.
   Journal ref: Proc. ICASSP 2024
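The input construction in ReSLM can be pictured as retrieve-then-prefix: score candidate entities against the audio, then prepend the top hits as text. In the hedged sketch below, the entity list and the random embeddings are placeholders; the actual system uses a trained speech retriever and the SLM consumes the result.

```python
# Sketch of retrieval-augmented input construction: candidate entities are
# scored against an audio embedding and the top hits become a text prefix.
# Embeddings and entity names are invented placeholders, not trained models.
import numpy as np

rng = np.random.default_rng(1)
entities = ["Gonville Hotel", "Cambridge train station", "Pizza Hut Fen Ditton"]
entity_emb = rng.normal(size=(len(entities), 64))   # stand-in text encoder outputs
audio_emb = rng.normal(size=64)                     # stand-in speech retriever query

def retrieve(query, emb, names, k=2):
    sims = emb @ query / (np.linalg.norm(emb, axis=1) * np.linalg.norm(query))
    return [names[i] for i in np.argsort(-sims)[:k]]

prefix = "; ".join(retrieve(audio_emb, entity_emb, entities))
model_input = f"entities: {prefix} [speech embeddings follow]"
print(model_input)
```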
4. arXiv:2310.13010 [eess.AS, cs.AI]
   https://arxiv.org/abs/2310.13010
   Title: Detecting Speech Abnormalities with a Perceiver-based Sequence Classifier that Leverages a Universal Speech Model
   Authors: Hagen Soltau, Izhak Shafran, Alex Ottenwess, Joseph R. JR Duffy, Rene L. Utianski, Leland R. Barnard, John L. Stricker, Daniela Wiepert, David T. Jones, Hugo Botha
   Abstract: We propose a Perceiver-based sequence classifier to detect abnormalities in speech reflective of several neurological disorders. We combine this classifier with a Universal Speech Model (USM) that is trained (unsupervised) on 12 million hours of diverse audio recordings. Our model compresses long sequences into a small set of class-specific latent representations, and a factorized projection is used to predict different attributes of the disordered input speech. The benefit of our approach is that it allows us to model different regions of the input for different classes while remaining data efficient. We evaluated the proposed model extensively on a curated corpus from the Mayo Clinic. Our model outperforms standard transformer (80.9%) and perceiver (81.8%) models and achieves an average accuracy of 83.1%. With limited task-specific data, we find that pretraining is important, and, surprisingly, pretraining with the unrelated automatic speech recognition (ASR) task is also beneficial. Encodings from the middle layers provide a mix of both acoustic and phonetic information and achieve the best prediction results compared to just using the final-layer encodings (83.1% vs. 79.6%). The results are promising and, with further refinements, may help clinicians detect speech abnormalities without needing access to highly specialized speech-language pathologists.
   Submitted: 16 October 2023; originally announced October 2023.
   Journal ref: Proc. ASRU, 2023
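The compression step described above, a small set of class-specific latents cross-attending over a long encoded sequence, fits in a few lines. All dimensions and the single-head attention below are invented for illustration; the real model is a full Perceiver stack over USM encodings with a factorized output projection.

```python
# Rough sketch of the classifier shape: class-specific latent vectors
# cross-attend over a long encoded speech sequence, and a projection maps
# each summarized latent to attribute logits. All sizes are illustrative.
import numpy as np

rng = np.random.default_rng(2)
T, d, n_latents, n_classes = 1200, 32, 4, 5
frames = rng.normal(size=(T, d))            # stand-in for USM encoder outputs
latents = rng.normal(size=(n_latents, d))   # learned class-specific queries
W_out = rng.normal(size=(d, n_classes)) * 0.1

def softmax(x, axis=-1):
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

attn = softmax(latents @ frames.T / np.sqrt(d))   # (n_latents, T) attention
summary = attn @ frames                           # compressed representations
logits = summary @ W_out                          # per-latent attribute logits
print(logits.shape)                               # (4, 5)
```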
5. arXiv:2310.00230 [cs.CL, cs.SD, eess.AS]
   https://arxiv.org/abs/2310.00230
   Title: SLM: Bridge the thin gap between speech and text foundation models
   Authors: Mingqiu Wang, Wei Han, Izhak Shafran, Zelin Wu, Chung-Cheng Chiu, Yuan Cao, Yongqiang Wang, Nanxin Chen, Yu Zhang, Hagen Soltau, Paul Rubenstein, Lukas Zilka, Dian Yu, Zhong Meng, Golan Pundak, Nikhil Siddhartha, Johan Schalkwyk, Yonghui Wu
   Abstract: We present a joint Speech and Language Model (SLM), a multitask, multilingual, and dual-modal model that takes advantage of pretrained foundational speech and language models. SLM freezes the pretrained foundation models to maximally preserve their capabilities, and only trains a simple adapter with just 1% (156M) of the foundation models' parameters. This adaptation not only leads SLM to achieve strong performance on conventional tasks such as speech recognition (ASR) and speech translation (AST), but also introduces the novel capability of zero-shot instruction-following for more diverse tasks: given a speech input and a text instruction, SLM is able to perform unseen generation tasks, including contextual biasing ASR using real-time context, dialog generation, speech continuation, and question answering. Our approach demonstrates that the representational gap between pretrained speech and language models might be narrower than one would expect, and can be bridged by a simple adaptation mechanism. As a result, SLM is not only efficient to train, but also inherits strong capabilities already acquired in foundation models of different modalities.
   Submitted: 29 September 2023; originally announced October 2023.
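The frozen-backbone recipe is straightforward to express: mark both pretrained models' parameters as non-trainable and optimize only a small adapter between them. The modules below are stand-ins, not the actual SLM components; the sketch only shows the freezing pattern and the resulting trainable fraction.

```python
# Sketch of the frozen-backbone adapter recipe. The GRU and Linear modules
# are placeholders for the real pretrained speech model and LLM.
import torch
import torch.nn as nn

speech_encoder = nn.GRU(80, 256, batch_first=True)  # stand-in speech model
llm = nn.Linear(512, 1000)                           # stand-in LLM stack
adapter = nn.Sequential(nn.Linear(256, 512), nn.ReLU(), nn.Linear(512, 512))

for p in list(speech_encoder.parameters()) + list(llm.parameters()):
    p.requires_grad = False                          # freeze the backbones

trainable = sum(p.numel() for p in adapter.parameters())
total = trainable + sum(p.numel() for m in (speech_encoder, llm)
                        for p in m.parameters())
print(f"trainable fraction: {trainable / total:.1%}")

opt = torch.optim.Adam(adapter.parameters(), lr=1e-4)  # only the adapter learns
```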
6. arXiv:2306.08131 [eess.AS, cs.SD]
   https://arxiv.org/abs/2306.08131
   Title: Efficient Adapters for Giant Speech Models
   Authors: Nanxin Chen, Izhak Shafran, Yu Zhang, Chung-Cheng Chiu, Hagen Soltau, James Qin, Yonghui Wu
   Abstract: Large pre-trained speech models are widely used as the de-facto paradigm, especially in scenarios where there is a limited amount of labeled data available. However, finetuning all parameters of the self-supervised learned model can be computationally expensive, and becomes infeasible as the size of the model and the number of downstream tasks scale. In this paper, we propose a novel approach called Two Parallel Adapter (TPA) that is inserted into the conformer-based pre-trained model instead. TPA is based on systematic studies of the residual adapter, a popular approach for finetuning a subset of parameters. We evaluate TPA on various public benchmarks, and the experimental results demonstrate its superior performance, which is close to full finetuning on different datasets and speech tasks. These results show that TPA is an effective and efficient approach for serving large pre-trained speech models. Ablation studies show that TPA can also be pruned, especially for lower blocks.
   Submitted: 13 June 2023; originally announced June 2023.
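As a rough picture of the adapter pattern TPA builds on, the sketch below applies two bottleneck adapters in parallel as residual corrections to a frozen block's output. Where exactly the two branches attach inside a conformer block follows the paper, not this toy, and all sizes are invented.

```python
# Hedged sketch of two parallel bottleneck adapters added as residual
# corrections to a frozen block's output; placement and sizes are invented.
import torch
import torch.nn as nn

class ParallelAdapters(nn.Module):
    """Two bottleneck adapters applied in parallel, residual-style."""
    def __init__(self, d_model=512, bottleneck=64):
        super().__init__()
        def branch():
            return nn.Sequential(nn.Linear(d_model, bottleneck), nn.ReLU(),
                                 nn.Linear(bottleneck, d_model))
        self.a, self.b = branch(), branch()

    def forward(self, x):
        return x + self.a(x) + self.b(x)   # frozen output plus two corrections

x = torch.randn(2, 100, 512)               # output of a frozen conformer block
print(ParallelAdapters()(x).shape)         # shape is preserved: (2, 100, 512)
```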
7. arXiv:2306.07944 [eess.AS, cs.AI, cs.CL]
   https://arxiv.org/abs/2306.07944
   Title: Speech-to-Text Adapter and Speech-to-Entity Retriever Augmented LLMs for Speech Understanding
   Authors: Mingqiu Wang, Izhak Shafran, Hagen Soltau, Wei Han, Yuan Cao, Dian Yu, Laurent El Shafey
   Abstract: Large Language Models (LLMs) have been applied in the speech domain, often incurring a performance drop due to misalignment between speech and language representations. To bridge this gap, we propose a joint speech and language model (SLM) using a Speech2Text adapter, which maps speech into the text token embedding space without loss of speech information. Additionally, using CTC-based blank-filtering, we can reduce the speech sequence length to that of the text. On the speech MultiWoz dataset (DSTC11 challenge), SLM largely improves the dialog state tracking (DST) performance (24.7% to 28.4% accuracy). Further, to address errors on rare entities, we augment SLM with a Speech2Entity retriever, which uses speech to retrieve relevant entities, and then adds them to the original SLM input as a prefix. With this retrieval-augmented SLM (ReSLM), the DST performance jumps to 34.6% accuracy. Moreover, augmenting the ASR task with the dialog understanding task improves the ASR performance from 9.4% to 8.5% WER.
   Submitted: 8 June 2023; originally announced June 2023.
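CTC-based blank-filtering, as used by the Speech2Text adapter above, has a compact form: drop every frame whose CTC argmax is the blank token, so the speech sequence shrinks toward text length. The logits and frames below are random placeholders for a real CTC head and speech encoder.

```python
# Sketch of CTC-based blank-filtering: frames whose CTC argmax is the blank
# token are dropped, shortening the speech sequence toward text length.
import numpy as np

rng = np.random.default_rng(3)
T, vocab, blank_id = 50, 30, 0
ctc_logits = rng.normal(size=(T, vocab))    # stand-in for a CTC head's output
frames = rng.normal(size=(T, 256))          # stand-in speech encoder outputs

keep = ctc_logits.argmax(axis=1) != blank_id
filtered = frames[keep]                     # only non-blank frames feed the LLM
print(frames.shape, "->", filtered.shape)
```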
8. arXiv:2303.01037 [cs.CL, cs.SD, eess.AS]
   https://arxiv.org/abs/2303.01037
   Title: Google USM: Scaling Automatic Speech Recognition Beyond 100 Languages
   Authors: Yu Zhang, Wei Han, James Qin, Yongqiang Wang, Ankur Bapna, Zhehuai Chen, Nanxin Chen, Bo Li, Vera Axelrod, Gary Wang, Zhong Meng, Ke Hu, Andrew Rosenberg, Rohit Prabhavalkar, Daniel S. Park, Parisa Haghani, Jason Riesa, Ginger Perng, Hagen Soltau, Trevor Strohman, Bhuvana Ramabhadran, Tara Sainath, Pedro Moreno, Chung-Cheng Chiu, Johan Schalkwyk, et al. (2 additional authors not shown)
   Abstract: We introduce the Universal Speech Model (USM), a single large model that performs automatic speech recognition (ASR) across 100+ languages. This is achieved by pre-training the encoder of the model on a large unlabeled multilingual dataset of 12 million (M) hours spanning over 300 languages, and fine-tuning on a smaller labeled dataset. We use multilingual pre-training with random-projection quantization and speech-text modality matching to achieve state-of-the-art performance on downstream multilingual ASR and speech-to-text translation tasks. We also demonstrate that despite using a labeled training set 1/7-th the size of that used for the Whisper model, our model exhibits comparable or better performance on both in-domain and out-of-domain speech recognition tasks across many languages.
   Submitted: 24 September 2023; v1 submitted 2 March 2023; originally announced March 2023.
   Comments: 20 pages, 7 figures, 8 tables
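Random-projection quantization, one of the pre-training ingredients named above, can be sketched as: project each feature frame with a fixed random matrix and take the nearest entry of a fixed random codebook as its discrete label for masked prediction. The sizes below are illustrative, not USM's.

```python
# Sketch of random-projection quantization for pre-training targets:
# a frozen random projection plus a frozen random codebook turn continuous
# frames into discrete labels. All dimensions are invented.
import numpy as np

rng = np.random.default_rng(4)
d_in, d_code, n_codes, T = 80, 16, 512, 100
proj = rng.normal(size=(d_in, d_code))          # frozen random projection
codebook = rng.normal(size=(n_codes, d_code))   # frozen random codebook
codebook /= np.linalg.norm(codebook, axis=1, keepdims=True)

feats = rng.normal(size=(T, d_in))              # stand-in for log-mel frames
z = feats @ proj
z /= np.linalg.norm(z, axis=1, keepdims=True)
targets = (z @ codebook.T).argmax(axis=1)       # labels for masked prediction
print(targets[:10])
```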
9. arXiv:2212.09939 [cs.CL]
   https://arxiv.org/abs/2212.09939
   Title: AnyTOD: A Programmable Task-Oriented Dialog System
   Authors: Jeffrey Zhao, Yuan Cao, Raghav Gupta, Harrison Lee, Abhinav Rastogi, Mingqiu Wang, Hagen Soltau, Izhak Shafran, Yonghui Wu
   Abstract: We propose AnyTOD, an end-to-end, zero-shot task-oriented dialog (TOD) system capable of handling unseen tasks without task-specific training. We view TOD as a program executed by a language model (LM), where program logic and ontology are provided by a designer as a schema. To enable generalization to unseen schemas and programs without prior training, AnyTOD adopts a neuro-symbolic approach. A neural LM keeps track of events occurring during a conversation, and a symbolic program implementing the dialog policy is executed to recommend the next actions AnyTOD should take. This approach drastically reduces data annotation and model training requirements, addressing the enduring challenge of rapidly adapting a TOD system to unseen tasks and domains. We demonstrate state-of-the-art results on the STAR, ABCD and SGD benchmarks. We also demonstrate strong zero-shot transfer ability in low-resource settings, such as zero-shot on MultiWOZ. In addition, we release STARv2, an updated version of the STAR dataset with richer annotations, for benchmarking zero-shot end-to-end TOD models.
   Submitted: 13 February 2023; v1 submitted 19 December 2022; originally announced December 2022.
   Comments: v2, updated with MultiWOZ, SGD results
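The neuro-symbolic split in AnyTOD can be caricatured in a few lines: a neural LM would populate the tracked event state, while a symbolic program over the schema recommends the next action. The schema, state, and action names below are invented; the paper's schemas and programs are far richer.

```python
# Toy sketch of the neuro-symbolic split: an LM would fill in the tracked
# state; here a hand-written dict stands in, and an explicit program over
# an invented schema recommends the next action.
schema = {
    "slots": ["restaurant_name", "time"],
    "final_action": "book",
}

def policy(state):
    """Dialog policy as an explicit, schema-driven program."""
    for slot in schema["slots"]:
        if state.get(slot) is None:
            return f"request({slot})"       # ask for the first missing slot
    return schema["final_action"]

tracked = {"restaurant_name": "Gonville Hotel", "time": None}  # from the LM
print(policy(tracked))                      # -> request(time)
```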
10. arXiv:2212.08704 [cs.AI]
   https://arxiv.org/abs/2212.08704
   Title: Speech Aware Dialog System Technology Challenge (DSTC11)
   Authors: Hagen Soltau, Izhak Shafran, Mingqiu Wang, Abhinav Rastogi, Jeffrey Zhao, Ye Jia, Wei Han, Yuan Cao, Aramys Miranda
   Abstract: Most research on task-oriented dialog modeling is based on written text input. However, users often interact with practical dialog systems using speech as input. Typically, systems convert speech into text using an Automatic Speech Recognition (ASR) system, introducing errors. Furthermore, these systems do not address the differences between written and spoken language. Research on this topic is stymied by the lack of a public corpus. Motivated by these considerations, our goal in hosting the speech-aware dialog state tracking challenge was to create a public corpus or task which can be used to investigate the performance gap between the written and spoken forms of input, develop models that could alleviate this gap, and establish whether Text-to-Speech-based (TTS) systems are a reasonable surrogate for the more labor-intensive human data collection. We created three spoken versions of the popular written-domain MultiWoz task: (a) TTS-Verbatim, where written user inputs were converted into speech waveforms using a TTS system; (b) Human-Verbatim, where humans spoke the user inputs verbatim; and (c) Human-Paraphrased, where humans paraphrased the user inputs. Additionally, we provided different forms of ASR output to encourage wider participation from teams that may not have access to state-of-the-art ASR systems. These included ASR transcripts, word time stamps, and latent representations of the audio (audio encoder outputs). In this paper, we describe the corpus, report results from participating teams, provide preliminary analyses of their results, and summarize the current state of the art in this domain.
   Submitted: 16 December 2022; originally announced December 2022.
11. arXiv:2210.06656 [cs.CL]
   https://arxiv.org/abs/2210.06656
   Title: Knowledge-grounded Dialog State Tracking
   Authors: Dian Yu, Mingqiu Wang, Yuan Cao, Izhak Shafran, Laurent El Shafey, Hagen Soltau
arXiv:2210.06656 (https://arxiv.org/abs/2210.06656) [pdf, other] cs.CL
Knowledge-grounded Dialog State Tracking
Authors: Dian Yu, Mingqiu Wang, Yuan Cao, Izhak Shafran, Laurent El Shafey, Hagen Soltau
Abstract: Knowledge (including structured knowledge such as schemas and ontologies, and unstructured knowledge such as web corpora) is a critical part of dialog understanding, especially for unseen tasks and domains. Traditionally, such domain-specific knowledge is encoded implicitly into model parameters for the execution of downstream tasks, which makes training inefficient. In addition, such models are not easily transferable to new tasks with different schemas. In this work, we propose to perform dialog state tracking grounded on knowledge encoded externally. We query relevant knowledge of various forms based on the dialog context, and such information grounds the prediction of dialog states. We demonstrate superior performance of our proposed method over strong baselines, especially in the few-shot learning setting.
Submitted 12 October, 2022; originally announced October 2022.
Comments: EMNLP 2022 Findings
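In its simplest form, grounding on external knowledge amounts to retrieving the schema text most relevant to the dialog context and conditioning the tracker on it. A rough sketch of that retrieval step (the embed() placeholder and schema snippets are invented for illustration; they are not the paper's retriever):

    import zlib
    import numpy as np

    # Hypothetical external knowledge: natural-language slot descriptions.
    schema_snippets = [
        "hotel-area: the part of town where the hotel is located",
        "train-day: the day of the week for the train departure",
        "restaurant-food: the cuisine served by the restaurant",
    ]

    def embed(text):
        # Placeholder encoder: a deterministic random unit vector per string.
        # A real system would use a trained text encoder here.
        rng = np.random.default_rng(zlib.crc32(text.encode()))
        v = rng.standard_normal(64)
        return v / np.linalg.norm(v)

    def retrieve(context, snippets, k=2):
        # Rank knowledge snippets by cosine similarity to the dialog context.
        q = embed(context)
        return sorted(snippets, key=lambda s: -float(embed(s) @ q))[:k]

    context = "user: i need a place to stay in the north with 4 stars"
    grounded = " | ".join(retrieve(context, schema_snippets)) + " ||| " + context
    print(grounded)  # fed to the state tracker in place of the raw context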
arXiv:2205.04515 (https://arxiv.org/abs/2205.04515) [pdf, other] cs.CL
Unsupervised Slot Schema Induction for Task-oriented Dialog
Authors: Dian Yu, Mingqiu Wang, Yuan Cao, Izhak Shafran, Laurent El Shafey, Hagen Soltau
Abstract: Carefully designed schemas describing how to collect and annotate dialog corpora are a prerequisite for building task-oriented dialog systems. In practical applications, manually designing schemas can be error-prone, laborious, iterative, and slow, especially when the schema is complicated. To alleviate this expensive and time-consuming process, we propose an unsupervised approach for slot schema induction from unlabeled dialog corpora. Leveraging in-domain language models and unsupervised parsing structures, our data-driven approach extracts candidate slots without constraints, followed by coarse-to-fine clustering to induce slot types. We compare our method against several strong supervised baselines, and show significant performance improvement in slot schema induction on the MultiWOZ and SGD datasets. We also demonstrate the effectiveness of induced schemas on downstream applications including dialog state tracking and response generation.
Submitted 9 May, 2022; originally announced May 2022.
Comments: NAACL 2022
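The coarse-to-fine step can be pictured as clustering candidate value spans by their embeddings, first into broad groups and then, within large groups, into finer slot types. A toy scikit-learn sketch (the 2-D vectors stand in for language-model embeddings, and the cluster counts are invented):

    import numpy as np
    from sklearn.cluster import AgglomerativeClustering

    # Toy 2-D stand-ins for contextual embeddings of candidate slot-value spans.
    spans = ["north", "south", "monday", "friday", "cheap", "expensive"]
    X = np.array([[0.0, 1.0], [0.1, 0.9], [1.0, 0.0],
                  [0.9, 0.1], [0.5, 0.5], [0.6, 0.4]])

    # Coarse pass: a few clusters separate broad semantic fields.
    coarse = AgglomerativeClustering(n_clusters=3).fit_predict(X)

    # Fine pass: re-cluster within each large coarse group to induce slot types.
    for c in sorted(set(coarse)):
        idx = np.where(coarse == c)[0]
        members = [spans[i] for i in idx]
        if len(idx) > 2:
            fine = AgglomerativeClustering(n_clusters=2).fit_predict(X[idx])
            print(f"coarse {c}:", list(zip(members, fine)))
        else:
            print(f"coarse {c}:", members)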
arXiv:2203.03543 (https://arxiv.org/abs/2203.03543) [pdf, other] cs.CL; cs.AI
RNN Transducers for Nested Named Entity Recognition with constraints on alignment for long sequences
Authors: Hagen Soltau, Izhak Shafran, Mingqiu Wang, Laurent El Shafey
Abstract: Popular solutions to Named Entity Recognition (NER) include conditional random fields, sequence-to-sequence models, and the question-answering framework. However, they are not suitable for nested and overlapping spans with large ontologies, or for predicting the position of the entities. To fill this gap, we introduce a new model for the NER task -- an RNN transducer (RNN-T). These models are trained using paired input and output sequences without explicitly specifying the alignment between them, similar to other seq-to-seq models. RNN-T models learn the alignment using a loss function that sums over all alignments. In NER tasks, however, the alignment between words and target labels is available from the human annotations. We propose a fixed-alignment RNN-T model that utilizes the given alignment while preserving the benefits of RNN-Ts, such as modeling output dependencies. As a more general case, we also propose a constrained-alignment model where users can specify a relaxation of the given input alignment and the model will learn an alignment within the given constraints. In other words, we propose a family of seq-to-seq models which can leverage alignments between input and target sequences when available. Through empirical experiments on a challenging real-world medical NER task with multiple nested ontologies, we demonstrate that our fixed-alignment model outperforms the standard RNN-T model, improving the F1-score from 0.70 to 0.74.
Submitted 8 February, 2022; originally announced March 2022.
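The distinction drawn here is between the standard RNN-T loss, which marginalizes over every monotonic alignment via a forward recursion, and a fixed-alignment variant that scores a single annotated path. A numpy sketch of the standard recursion, with random stand-ins for the model's log-probabilities (not the paper's implementation):

    import numpy as np

    def rnnt_neg_log_likelihood(log_probs, targets, blank=0):
        # Standard RNN-T loss: log-sum over all monotonic alignments.
        # log_probs: (T, U+1, V) log P(symbol | frame t, u labels emitted).
        # targets:   length-U label sequence.
        T, U1, _ = log_probs.shape
        U = U1 - 1
        alpha = np.full((T, U + 1), -np.inf)
        alpha[0, 0] = 0.0
        for t in range(T):
            for u in range(U + 1):
                if t > 0:  # arrive by emitting blank (advance one frame)
                    alpha[t, u] = np.logaddexp(
                        alpha[t, u], alpha[t - 1, u] + log_probs[t - 1, u, blank])
                if u > 0:  # arrive by emitting the next target label
                    alpha[t, u] = np.logaddexp(
                        alpha[t, u],
                        alpha[t, u - 1] + log_probs[t, u - 1, targets[u - 1]])
        return -(alpha[T - 1, U] + log_probs[T - 1, U, blank])

    # A fixed-alignment variant would replace the logaddexp over incoming arcs
    # with the single transition permitted by the annotated alignment.
    rng = np.random.default_rng(0)
    raw = rng.standard_normal((4, 3, 5))
    log_probs = raw - np.log(np.exp(raw).sum(-1, keepdims=True))  # normalize
    print(rnnt_neg_log_likelihood(log_probs, targets=[2, 4]))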
arXiv:2110.15222 (https://arxiv.org/abs/2110.15222) [pdf, other] cs.CL; cs.SD; eess.AS
Word-level confidence estimation for RNN transducers
Authors: Mingqiu Wang, Hagen Soltau, Laurent El Shafey, Izhak Shafran
Abstract: A confidence estimate is an often-requested feature in applications such as medical transcription, where errors can impact patient care and the confidence estimate can be used to alert medical professionals to verify potential errors in recognition. In this paper, we present a lightweight neural confidence model tailored for Automatic Speech Recognition (ASR) systems with Recurrent Neural Network Transducers (RNN-T). Compared to other existing approaches, our model utilizes: (a) the time information associated with recognized words, which reduces the computational complexity, and (b) a simple and elegant trick for mapping between sub-word and word sequences. The mapping addresses the non-unique tokenization and token deletion problems while amplifying differences between confusable words. Through extensive empirical evaluations on two different long-form test sets, we demonstrate that the model achieves a performance of 0.4 Normalized Cross Entropy (NCE) and 0.05 Expected Calibration Error (ECE). It is robust across different ASR configurations, including target types (graphemes vs. morphemes), traffic conditions (streaming vs. non-streaming), and encoder types. We further discuss the importance of evaluation metrics that reflect practical applications and highlight the need for further work on improving the Area Under the Curve (AUC) for Negative Predictive Value (NPV) and True Negative Rate (TNR).
Submitted 28 September, 2021; originally announced October 2021.
Journal ref: Proc. ASRU 2021
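Both headline metrics are straightforward to reproduce: Normalized Cross Entropy measures how much better the confidences are than always predicting the base correct rate, and Expected Calibration Error bins confidences against empirical accuracy. A small sketch under the usual definitions, with made-up confidences:

    import numpy as np

    def nce(conf, correct):
        # Normalized cross entropy of word confidences vs. 0/1 correctness.
        conf = np.clip(conf, 1e-7, 1 - 1e-7)
        p = correct.mean()  # base rate of correct words
        h_base = -(p * np.log(p) + (1 - p) * np.log(1 - p))
        h_model = -np.mean(correct * np.log(conf) +
                           (1 - correct) * np.log(1 - conf))
        return (h_base - h_model) / h_base

    def ece(conf, correct, bins=10):
        # Expected calibration error: |accuracy - mean confidence| per bin,
        # weighted by the fraction of words falling in that bin.
        edges = np.linspace(0.0, 1.0, bins + 1)
        total = 0.0
        for lo, hi in zip(edges[:-1], edges[1:]):
            mask = (conf >= lo) & ((conf < hi) if hi < 1.0 else (conf <= hi))
            if mask.any():
                total += mask.mean() * abs(correct[mask].mean() - conf[mask].mean())
        return total

    conf = np.array([0.95, 0.80, 0.30, 0.99, 0.60])   # hypothetical confidences
    correct = np.array([1, 1, 0, 1, 1], dtype=float)  # 1 = word was correct
    print(f"NCE={nce(conf, correct):.2f}  ECE={ece(conf, correct):.2f}")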
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2110.15222v1-abstract-full').style.display = 'none'; document.getElementById('2110.15222v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 September, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Proc. ASRU 2021 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2104.02219">arXiv:2104.02219</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2104.02219">pdf</a>, <a href="https://arxiv.org/format/2104.02219">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Understanding Medical Conversations: Rich Transcription, Confidence Scores &amp; Information Extraction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Soltau%2C+H">Hagen Soltau</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+M">Mingqiu Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Shafran%2C+I">Izhak Shafran</a>, <a href="/search/cs?searchtype=author&amp;query=Shafey%2C+L+E">Laurent El Shafey</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2104.02219v1-abstract-short" style="display: inline;"> In this paper, we describe novel components for extracting clinically relevant information from medical conversations which will be available as Google APIs. We describe a transformer-based Recurrent Neural Network Transducer (RNN-T) model tailored for long-form audio, which can produce rich transcriptions including speaker segmentation, speaker role labeling, punctuation and capitalization. On a&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2104.02219v1-abstract-full').style.display = 'inline'; document.getElementById('2104.02219v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2104.02219v1-abstract-full" style="display: none;"> In this paper, we describe novel components for extracting clinically relevant information from medical conversations which will be available as Google APIs. We describe a transformer-based Recurrent Neural Network Transducer (RNN-T) model tailored for long-form audio, which can produce rich transcriptions including speaker segmentation, speaker role labeling, punctuation and capitalization. On a representative test set, we compare performance of RNN-T models with different encoders, units and streaming constraints. Our transformer-based streaming model performs at about 20% WER on the ASR task, 6% WDER on the diarization task, 43% SER on periods, 52% SER on commas, 43% SER on question marks and 30% SER on capitalization. Our recognizer is paired with a confidence model that utilizes both acoustic and lexical features from the recognizer. The model performs at about 0.37 NCE. Finally, we describe a RNN-T based tagging model. 
The performance of the model depends on the ontologies, with F-scores of 0.90 for medications, 0.76 for symptoms, 0.75 for conditions, 0.76 for diagnosis, and 0.61 for treatments. While there is still room for improvement, our results suggest that these models are sufficiently accurate for practical applications.
Submitted 5 April, 2021; originally announced April 2021.
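One plausible way to realize rich transcription of this kind is to let the transducer emit markup tokens inline with the words, so that speaker turns, punctuation and capitalization all fall out of a single decoding pass. A hypothetical rendering step (the token inventory is invented for illustration, not the API's actual format):

    # Hypothetical inline-markup token stream from a rich-transcription model.
    tokens = ["<spk:dr>", "how", "are", "you", "feeling", "<?>",
              "<spk:pt>", "much", "better", "<.>"]

    PUNCT = {"<.>": ".", "<?>": "?", "<,>": ","}

    def render(tokens):
        # Turn markup tokens into a readable, capitalized speaker-turn transcript.
        lines, words, speaker = [], [], None
        def flush():
            if words:
                words[0] = words[0].capitalize()
                lines.append(f"{speaker}: {' '.join(words)}")
                words.clear()
        for tok in tokens:
            if tok.startswith("<spk:"):
                flush()
                speaker = tok[5:-1].upper()
            elif tok in PUNCT:
                words[-1] += PUNCT[tok]
            else:
                words.append(tok)
        flush()
        return "\n".join(lines)

    print(render(tokens))
    # DR: How are you feeling?
    # PT: Much better.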
arXiv:2003.11531 (https://arxiv.org/abs/2003.11531) [pdf, other] cs.CL
The Medical Scribe: Corpus Development and Model Performance Analyses
Authors: Izhak Shafran, Nan Du, Linh Tran, Amanda Perry, Lauren Keyes, Mark Knichel, Ashley Domin, Lei Huang, Yuhui Chen, Gang Li, Mingqiu Wang, Laurent El Shafey, Hagen Soltau, Justin S. Paul
Abstract: There is a growing interest in creating tools to assist in clinical note generation using the audio of provider-patient encounters. Motivated by this goal and with the help of providers and medical scribes, we developed an annotation scheme to extract relevant clinical concepts. We used this annotation scheme to label a corpus of about 6k clinical encounters, which was then used to train a state-of-the-art tagging model. We report ontologies, labeling results, model performances, and detailed analyses of the results. Our results show that entities related to medications can be extracted with a relatively high accuracy of 0.90 F-score, followed by symptoms at 0.72 F-score, and conditions at 0.57 F-score. In our task, we not only identify where the symptoms are mentioned but also map them to canonical forms as they appear in the clinical notes. Of the different types of errors, in about 19-38% of the cases we find that the model output was correct, and about 17-32% of the errors do not impact the clinical note. Taken together, the models developed in this work are more useful than the F-scores reflect, making this a promising approach for practical applications.
Submitted 11 March, 2020; originally announced March 2020.
Comments: Extended version of the paper accepted at LREC 2020
Journal ref: Proceedings of Language Resources and Evaluation, 2020
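The per-ontology F-scores quoted here are span-level: a predicted entity counts only if both its span and its label match a reference annotation. A compact sketch of that scoring, with fabricated annotations:

    def span_f1(pred, gold):
        # Entity-level F1 over (start, end, label) triples.
        pred, gold = set(pred), set(gold)
        tp = len(pred & gold)
        precision = tp / len(pred) if pred else 0.0
        recall = tp / len(gold) if gold else 0.0
        if precision + recall == 0:
            return 0.0
        return 2 * precision * recall / (precision + recall)

    # Fabricated annotations: (start_token, end_token, ontology_label).
    gold = [(3, 5, "medication"), (8, 9, "symptom")]
    pred = [(3, 5, "medication"), (8, 9, "condition")]
    print(f"F1 = {span_f1(pred, gold):.2f}")  # label mismatch on second span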
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Extended version of the paper accepted at LREC 2020</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Proceedings of Language Resources and Evaluation, 2020 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1907.05337">arXiv:1907.05337</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1907.05337">pdf</a>, <a href="https://arxiv.org/format/1907.05337">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Joint Speech Recognition and Speaker Diarization via Sequence Transduction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Shafey%2C+L+E">Laurent El Shafey</a>, <a href="/search/cs?searchtype=author&amp;query=Soltau%2C+H">Hagen Soltau</a>, <a href="/search/cs?searchtype=author&amp;query=Shafran%2C+I">Izhak Shafran</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1907.05337v1-abstract-short" style="display: inline;"> Speech applications dealing with conversations require not only recognizing the spoken words, but also determining who spoke when. The task of assigning words to speakers is typically addressed by merging the outputs of two separate systems, namely, an automatic speech recognition (ASR) system and a speaker diarization (SD) system. The two systems are trained independently with different objective&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1907.05337v1-abstract-full').style.display = 'inline'; document.getElementById('1907.05337v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1907.05337v1-abstract-full" style="display: none;"> Speech applications dealing with conversations require not only recognizing the spoken words, but also determining who spoke when. The task of assigning words to speakers is typically addressed by merging the outputs of two separate systems, namely, an automatic speech recognition (ASR) system and a speaker diarization (SD) system. The two systems are trained independently with different objective functions. Often the SD systems operate directly on the acoustics and are not constrained to respect word boundaries and this deficiency is overcome in an ad hoc manner. Motivated by recent advances in sequence to sequence learning, we propose a novel approach to tackle the two tasks by a joint ASR and SD system using a recurrent neural network transducer. Our approach utilizes both linguistic and acoustic cues to infer speaker roles, as opposed to typical SD systems, which only use acoustic cues. We evaluated the performance of our approach on a large corpus of medical conversations between physicians and patients. 
Compared to a competitive conventional baseline, our approach improves the word-level diarization error rate from 15.8% to 2.2%.
Submitted 8 July, 2019; originally announced July 2019.
Journal ref: Proc. Interspeech 2019
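Word-level diarization error rate (WDER), the metric improved from 15.8% to 2.2% above, charges errors to the speaker assignment: among words the recognizer got right or substituted, it counts those attributed to the wrong speaker. A sketch of that bookkeeping under this commonly cited definition (the alignment is made up):

    def wder(aligned):
        # aligned: (asr_status, speaker_correct) pairs per hypothesis word,
        # where asr_status is "correct" or "substitution"; by definition,
        # insertions and deletions are excluded from the denominator.
        scored = [(s, ok) for s, ok in aligned if s in ("correct", "substitution")]
        wrong_speaker = sum(not ok for _, ok in scored)
        return wrong_speaker / len(scored)

    # Made-up alignment of hypothesis words to reference words and speakers.
    aligned = [("correct", True), ("correct", True), ("substitution", False),
               ("correct", True), ("correct", False)]
    print(f"WDER = {wder(aligned):.2f}")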
arXiv:1610.09975 (https://arxiv.org/abs/1610.09975) [pdf, other] cs.CL; cs.LG; cs.NE
Neural Speech Recognizer: Acoustic-to-Word LSTM Model for Large Vocabulary Speech Recognition
Authors: Hagen Soltau, Hank Liao, Hasim Sak
Abstract: We present results showing that it is possible to build a competitive, greatly simplified, large vocabulary continuous speech recognition system with whole words as acoustic units. We model the output vocabulary of about 100,000 words directly using deep bi-directional LSTM RNNs with CTC loss. The model is trained on 125,000 hours of semi-supervised acoustic training data, which enables us to alleviate the data sparsity problem for word models. We show that the CTC word models work very well as an end-to-end all-neural speech recognition model, without the traditional context-dependent sub-word phone units that require a pronunciation lexicon, and without any language model, removing the need to decode. We demonstrate that the CTC word models perform better than a strong, more complex, state-of-the-art baseline with sub-word units.
Submitted 31 October, 2016; originally announced October 2016.
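Training whole-word acoustic units reduces to a CTC loss over a very large softmax, with a blank symbol and, as the abstract notes, no pronunciation lexicon or language model at decode time. A minimal PyTorch sketch with the vocabulary shrunk and tensors randomized for illustration:

    import torch
    import torch.nn as nn

    vocab_size = 1000        # stand-in for the ~100,000-word output vocabulary
    T, N, U = 50, 2, 8       # input frames, batch size, target words per utterance

    # Random stand-in for bi-directional LSTM encoder outputs (requires_grad so
    # backward() below works; a real model would produce these from audio).
    logits = torch.randn(T, N, vocab_size + 1, requires_grad=True)  # +1 for blank
    log_probs = logits.log_softmax(-1)

    targets = torch.randint(1, vocab_size + 1, (N, U))  # word ids; 0 is blank
    input_lengths = torch.full((N,), T, dtype=torch.long)
    target_lengths = torch.full((N,), U, dtype=torch.long)

    ctc = nn.CTCLoss(blank=0, zero_infinity=True)
    loss = ctc(log_probs, targets, input_lengths, target_lengths)
    loss.backward()  # would drive the word-level acoustic model updates
    print(float(loss))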
arXiv:1309.1501 (https://arxiv.org/abs/1309.1501) [pdf, ps, other] cs.LG; cs.CL; cs.NE; math.OC; stat.ML
Improvements to deep convolutional neural networks for LVCSR
Authors: Tara N. Sainath, Brian Kingsbury, Abdel-rahman Mohamed, George E. Dahl, George Saon, Hagen Soltau, Tomas Beran, Aleksandr Y. Aravkin, Bhuvana Ramabhadran
Abstract: Deep Convolutional Neural Networks (CNNs) are more powerful than Deep Neural Networks (DNNs), as they are able to better reduce spectral variation in the input signal. This has also been confirmed experimentally, with CNNs showing improvements in word error rate (WER) of 4-12% relative to DNNs across a variety of LVCSR tasks. In this paper, we describe different methods to further improve CNN performance. First, we conduct a deep analysis comparing limited weight sharing and full weight sharing with state-of-the-art features. Second, we apply various pooling strategies that have shown improvements in computer vision to an LVCSR speech task. Third, we introduce a method to effectively incorporate speaker adaptation, namely fMLLR, into log-mel features. Fourth, we introduce an effective strategy to use dropout during Hessian-free sequence training. We find that with these improvements, particularly with fMLLR and dropout, we are able to achieve an additional 2-3% relative improvement in WER on a 50-hour Broadcast News task over our previous best CNN baseline. On a larger 400-hour BN task, we find an additional 4-5% relative improvement over our previous best CNN baseline.
Submitted 10 December, 2013; v1 submitted 5 September, 2013; originally announced September 2013.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">6 pages, 1 figure</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 65K05; 90C15; 90C90 </p> </li> </ol> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 
