Search | arXiv e-print repository

<!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"/> <meta name="viewport" content="width=device-width, initial-scale=1"/>  <link rel="apple-touch-icon" sizes="180x180" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-16x16.png"> <link rel="manifest" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/site.webmanifest"> <link rel="mask-icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/safari-pinned-tab.svg" color="#b31b1b"> <link rel="shortcut icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon.ico"> <meta name="msapplication-TileColor" content="#b31b1b"> <meta name="msapplication-config" content="images/icons/browserconfig.xml"> <meta name="theme-color" content="#b31b1b">  <title>Search | arXiv e-print repository</title> <script defer src="https://static.arxiv.org/static/base/1.0.0a5/fontawesome-free-5.11.2-web/js/all.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/base/1.0.0a5/css/arxivstyle.css" /> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ messageStyle: "none", extensions: ["tex2jax.js"], jax: ["input/TeX", "output/HTML-CSS"], tex2jax: { inlineMath: [ ['$','$'], ["\$","\$"] ], displayMath: [ ['$$','$$'], ["\\[","\\]"] ], processEscapes: true, ignoreClass: '.*', processClass: 'mathjax.*' }, TeX: { extensions: ["AMSmath.js", "AMSsymbols.js", "noErrors.js"], noErrors: { inlineDelimiters: ["$","$"], multiLine: false, style: { "font-size": "normal", "border": "" } } }, "HTML-CSS": { availableFonts: ["TeX"] } }); </script> <script src='//static.arxiv.org/MathJax-2.7.3/MathJax.js'></script> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/notification.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/bulma-tooltip.min.css" /> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/search.css" /> <script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g=" crossorigin="anonymous"></script> <script src="https://static.arxiv.org/static/search/0.5.6/js/fieldset.js"></script> <style> radio#cf-customfield_11400 { display: none; } </style> </head> <body> <header><a href="#main-container" class="is-sr-only">Skip to main content</a>  <div class="attribution level is-marginless" role="banner"> <div class="level-left"> <a class="level-item" href="https://cornell.edu/"><img src="https://static.arxiv.org/static/base/1.0.0a5/images/cornell-reduced-white-SMALL.svg" alt="Cornell University" width="200" aria-label="logo" /></a> </div> <div class="level-right is-marginless"><p class="sponsors level-item is-marginless"><span id="support-ack-url">We gratefully acknowledge support from<br /> the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors. <a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div>  <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div>  <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 97 results for author: <span class="mathjax">Vu, N T</span> </h1> </div> <div class="level-right is-hidden-mobile">  <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>  </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Vu%2C+N+T">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." type="text" value="Vu, N T"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Vu%2C+N+T&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Vu, N T"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. </div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Vu%2C+N+T&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Vu%2C+N+T&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Vu%2C+N+T&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.13419">arXiv:2501.13419</a> <span> [<a href="https://arxiv.org/pdf/2501.13419">pdf</a>, <a href="https://arxiv.org/format/2501.13419">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> A Survey of Code-switched Arabic NLP: Progress, Challenges, and Future Directions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hamed%2C+I">Injy Hamed</a>, <a href="/search/cs?searchtype=author&query=Sabty%2C+C">Caroline Sabty</a>, <a href="/search/cs?searchtype=author&query=Abdennadher%2C+S">Slim Abdennadher</a>, <a href="/search/cs?searchtype=author&query=Vu%2C+N+T">Ngoc Thang Vu</a>, <a href="/search/cs?searchtype=author&query=Solorio%2C+T">Thamar Solorio</a>, <a href="/search/cs?searchtype=author&query=Habash%2C+N">Nizar Habash</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.13419v1-abstract-short" style="display: inline;"> Language in the Arab world presents a complex diglossic and multilingual setting, involving the use of Modern Standard Arabic, various dialects and sub-dialects, as well as multiple European languages. This diverse linguistic landscape has given rise to code-switching, both within Arabic varieties and between Arabic and foreign languages. The widespread occurrence of code-switching across the regi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.13419v1-abstract-full').style.display = 'inline'; document.getElementById('2501.13419v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.13419v1-abstract-full" style="display: none;"> Language in the Arab world presents a complex diglossic and multilingual setting, involving the use of Modern Standard Arabic, various dialects and sub-dialects, as well as multiple European languages. This diverse linguistic landscape has given rise to code-switching, both within Arabic varieties and between Arabic and foreign languages. The widespread occurrence of code-switching across the region makes it vital to address these linguistic needs when developing language technologies. In this paper, we provide a review of the current literature in the field of code-switched Arabic NLP, offering a broad perspective on ongoing efforts, challenges, research gaps, and recommendations for future research directions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.13419v1-abstract-full').style.display = 'none'; document.getElementById('2501.13419v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to COLING 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.08263">arXiv:2412.08263</a> <span> [<a href="https://arxiv.org/pdf/2412.08263">pdf</a>, <a href="https://arxiv.org/format/2412.08263">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Discrete Subgraph Sampling for Interpretable Graph based Visual Question Answering </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tilli%2C+P">Pascal Tilli</a>, <a href="/search/cs?searchtype=author&query=Vu%2C+N+T">Ngoc Thang Vu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.08263v1-abstract-short" style="display: inline;"> Explainable artificial intelligence (XAI) aims to make machine learning models more transparent. While many approaches focus on generating explanations post-hoc, interpretable approaches, which generate the explanations intrinsically alongside the predictions, are relatively rare. In this work, we integrate different discrete subset sampling methods into a graph-based visual question answering sys… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.08263v1-abstract-full').style.display = 'inline'; document.getElementById('2412.08263v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.08263v1-abstract-full" style="display: none;"> Explainable artificial intelligence (XAI) aims to make machine learning models more transparent. While many approaches focus on generating explanations post-hoc, interpretable approaches, which generate the explanations intrinsically alongside the predictions, are relatively rare. In this work, we integrate different discrete subset sampling methods into a graph-based visual question answering system to compare their effectiveness in generating interpretable explanatory subgraphs intrinsically. We evaluate the methods on the GQA dataset and show that the integrated methods effectively mitigate the performance trade-off between interpretability and answer accuracy, while also achieving strong co-occurrences between answer and question tokens. Furthermore, we conduct a human evaluation to assess the interpretability of the generated subgraphs using a comparative setting with the extended Bradley-Terry model, showing that the answer and question token co-occurrence metrics strongly correlate with human preferences. Our source code is publicly available. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.08263v1-abstract-full').style.display = 'none'; document.getElementById('2412.08263v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at COLING 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.05821">arXiv:2410.05821</a> <span> [<a href="https://arxiv.org/pdf/2410.05821">pdf</a>, <a href="https://arxiv.org/format/2410.05821">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> A Zero-Shot approach to the Conversational Tree Search Task </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=V%C3%A4th%2C+D">Dirk V盲th</a>, <a href="/search/cs?searchtype=author&query=Vu%2C+N+T">Ngoc Thang Vu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.05821v1-abstract-short" style="display: inline;"> In sensitive domains, such as legal or medial domains, the correctness of information given to users is critical. To address this, the recently introduced task Conversational Tree Search (CTS) provides a graph-based framework for controllable task-oriented dialog in sensitive domains. However, a big drawback of state-of-the-art CTS agents is their long training time, which is especially problemati… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.05821v1-abstract-full').style.display = 'inline'; document.getElementById('2410.05821v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.05821v1-abstract-full" style="display: none;"> In sensitive domains, such as legal or medial domains, the correctness of information given to users is critical. To address this, the recently introduced task Conversational Tree Search (CTS) provides a graph-based framework for controllable task-oriented dialog in sensitive domains. However, a big drawback of state-of-the-art CTS agents is their long training time, which is especially problematic as a new agent must be trained every time the associated domain graph is updated. The goal of this paper is to eliminate the need for training CTS agents altogether. To achieve this, we implement a novel LLM-based method for zero-shot, controllable CTS agents. We show that these agents significantly outperform state-of-the-art CTS agents (p<0.0001; Barnard Exact test) in simulation. This generalizes to all available CTS domains. Finally, we perform user evaluation to test the agent performance in the wild, showing that our policy significantly (p<0.05; Barnard Exact) improves task-success compared to the state-of-the-art Reinforcement Learning-based CTS agent. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.05821v1-abstract-full').style.display = 'none'; document.getElementById('2410.05821v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.11145">arXiv:2409.11145</a> <span> [<a href="https://arxiv.org/pdf/2409.11145">pdf</a>, <a href="https://arxiv.org/format/2409.11145">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> High-Resolution Speech Restoration with Latent Diffusion Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Dhyani%2C+T">Tushar Dhyani</a>, <a href="/search/cs?searchtype=author&query=Lux%2C+F">Florian Lux</a>, <a href="/search/cs?searchtype=author&query=Mancusi%2C+M">Michele Mancusi</a>, <a href="/search/cs?searchtype=author&query=Fabbro%2C+G">Giorgio Fabbro</a>, <a href="/search/cs?searchtype=author&query=Hohl%2C+F">Fritz Hohl</a>, <a href="/search/cs?searchtype=author&query=Vu%2C+N+T">Ngoc Thang Vu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.11145v2-abstract-short" style="display: inline;"> Traditional speech enhancement methods often oversimplify the task of restoration by focusing on a single type of distortion. Generative models that handle multiple distortions frequently struggle with phone reconstruction and high-frequency harmonics, leading to breathing and gasping artifacts that reduce the intelligibility of reconstructed speech. These models are also computationally demanding… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.11145v2-abstract-full').style.display = 'inline'; document.getElementById('2409.11145v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.11145v2-abstract-full" style="display: none;"> Traditional speech enhancement methods often oversimplify the task of restoration by focusing on a single type of distortion. Generative models that handle multiple distortions frequently struggle with phone reconstruction and high-frequency harmonics, leading to breathing and gasping artifacts that reduce the intelligibility of reconstructed speech. These models are also computationally demanding, and many solutions are restricted to producing outputs in the wide-band frequency range, which limits their suitability for professional applications. To address these challenges, we propose Hi-ResLDM, a novel generative model based on latent diffusion designed to remove multiple distortions and restore speech recordings to studio quality, sampled at 48kHz. We benchmark Hi-ResLDM against state-of-the-art methods that leverage GAN and Conditional Flow Matching (CFM) components, demonstrating superior performance in regenerating high-frequency-band details. Hi-ResLDM not only excels in non-instrusive metrics but is also consistently preferred in human evaluation and performs competitively on intrusive evaluations, making it ideal for high-resolution speech restoration. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.11145v2-abstract-full').style.display = 'none'; document.getElementById('2409.11145v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 17 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.14154">arXiv:2408.14154</a> <span> [<a href="https://arxiv.org/pdf/2408.14154">pdf</a>, <a href="https://arxiv.org/format/2408.14154">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Investigating the effect of Mental Models in User Interaction with an Adaptive Dialog Agent </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Vanderlyn%2C+L">Lindsey Vanderlyn</a>, <a href="/search/cs?searchtype=author&query=V%C3%A4th%2C+D">Dirk V盲th</a>, <a href="/search/cs?searchtype=author&query=Vu%2C+N+T">Ngoc Thang Vu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.14154v1-abstract-short" style="display: inline;"> Mental models play an important role in whether user interaction with intelligent systems, such as dialog systems is successful or not. Adaptive dialog systems present the opportunity to align a dialog agent's behavior with heterogeneous user expectations. However, there has been little research into what mental models users form when interacting with a task-oriented dialog system, how these model… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.14154v1-abstract-full').style.display = 'inline'; document.getElementById('2408.14154v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.14154v1-abstract-full" style="display: none;"> Mental models play an important role in whether user interaction with intelligent systems, such as dialog systems is successful or not. Adaptive dialog systems present the opportunity to align a dialog agent's behavior with heterogeneous user expectations. However, there has been little research into what mental models users form when interacting with a task-oriented dialog system, how these models affect users' interactions, or what role system adaptation can play in this process, making it challenging to avoid damage to human-AI partnership. In this work, we collect a new publicly available dataset for exploring user mental models about information seeking dialog systems. We demonstrate that users have a variety of conflicting mental models about such systems, the validity of which directly impacts the success of their interactions and perceived usability of system. Furthermore, we show that adapting a dialog agent's behavior to better align with users' mental models, even when done implicitly, can improve perceived usability, dialog efficiency, and success. To this end, we argue that implicit adaptation can be a valid strategy for task-oriented dialog systems, so long as developers first have a solid understanding of users' mental models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.14154v1-abstract-full').style.display = 'none'; document.getElementById('2408.14154v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">submitted to COLING 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.14153">arXiv:2408.14153</a> <span> [<a href="https://arxiv.org/pdf/2408.14153">pdf</a>, <a href="https://arxiv.org/format/2408.14153">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Explaining Vision-Language Similarities in Dual Encoders with Feature-Pair Attributions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=M%C3%B6ller%2C+L">Lucas M枚ller</a>, <a href="/search/cs?searchtype=author&query=Tilli%2C+P">Pascal Tilli</a>, <a href="/search/cs?searchtype=author&query=Vu%2C+N+T">Ngoc Thang Vu</a>, <a href="/search/cs?searchtype=author&query=Pad%C3%B3%2C+S">Sebastian Pad贸</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.14153v1-abstract-short" style="display: inline;"> Dual encoder architectures like CLIP models map two types of inputs into a shared embedding space and learn similarities between them. However, it is not understood how such models compare two inputs. Here, we address this research gap with two contributions. First, we derive a method to attribute predictions of any differentiable dual encoder onto feature-pair interactions between its inputs. Sec… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.14153v1-abstract-full').style.display = 'inline'; document.getElementById('2408.14153v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.14153v1-abstract-full" style="display: none;"> Dual encoder architectures like CLIP models map two types of inputs into a shared embedding space and learn similarities between them. However, it is not understood how such models compare two inputs. Here, we address this research gap with two contributions. First, we derive a method to attribute predictions of any differentiable dual encoder onto feature-pair interactions between its inputs. Second, we apply our method to CLIP-type models and show that they learn fine-grained correspondences between parts of captions and regions in images. They match objects across input modes and also account for mismatches. However, this visual-linguistic grounding ability heavily varies between object classes, depends on the training data distribution, and largely improves after in-domain training. Using our method we can identify knowledge gaps about specific object classes in individual models and can monitor their improvement upon fine-tuning. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.14153v1-abstract-full').style.display = 'none'; document.getElementById('2408.14153v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.21061">arXiv:2407.21061</a> <span> [<a href="https://arxiv.org/pdf/2407.21061">pdf</a>, <a href="https://arxiv.org/format/2407.21061">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Improving noisy student training for low-resource languages in End-to-End ASR using CycleGAN and inter-domain losses </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+C">Chia-Yu Li</a>, <a href="/search/cs?searchtype=author&query=Vu%2C+N+T">Ngoc Thang Vu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.21061v1-abstract-short" style="display: inline;"> Training a semi-supervised end-to-end speech recognition system using noisy student training has significantly improved performance. However, this approach requires a substantial amount of paired speech-text and unlabeled speech, which is costly for low-resource languages. Therefore, this paper considers a more extreme case of semi-supervised end-to-end automatic speech recognition where there are… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.21061v1-abstract-full').style.display = 'inline'; document.getElementById('2407.21061v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.21061v1-abstract-full" style="display: none;"> Training a semi-supervised end-to-end speech recognition system using noisy student training has significantly improved performance. However, this approach requires a substantial amount of paired speech-text and unlabeled speech, which is costly for low-resource languages. Therefore, this paper considers a more extreme case of semi-supervised end-to-end automatic speech recognition where there are limited paired speech-text, unlabeled speech (less than five hours), and abundant external text. Firstly, we observe improved performance by training the model using our previous work on semi-supervised learning "CycleGAN and inter-domain losses" solely with external text. Secondly, we enhance "CycleGAN and inter-domain losses" by incorporating automatic hyperparameter tuning, calling it "enhanced CycleGAN inter-domain losses." Thirdly, we integrate it into the noisy student training approach pipeline for low-resource scenarios. Our experimental results, conducted on six non-English languages from Voxforge and Common Voice, show a 20% word error rate reduction compared to the baseline teacher model and a 10% word error rate reduction compared to the baseline best student model, highlighting the significant improvements achieved through our proposed method. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.21061v1-abstract-full').style.display = 'none'; document.getElementById('2407.21061v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages (2 for references), 4 figures, published in SIGUL2024@LREC-COLING 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.02937">arXiv:2407.02937</a> <span> [<a href="https://arxiv.org/pdf/2407.02937">pdf</a>, <a href="https://arxiv.org/format/2407.02937">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.21437/Interspeech.2024-1615">10.21437/Interspeech.2024-1615 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Probing the Feasibility of Multilingual Speaker Anonymization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Meyer%2C+S">Sarina Meyer</a>, <a href="/search/cs?searchtype=author&query=Lux%2C+F">Florian Lux</a>, <a href="/search/cs?searchtype=author&query=Vu%2C+N+T">Ngoc Thang Vu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.02937v1-abstract-short" style="display: inline;"> In speaker anonymization, speech recordings are modified in a way that the identity of the speaker remains hidden. While this technology could help to protect the privacy of individuals around the globe, current research restricts this by focusing almost exclusively on English data. In this study, we extend a state-of-the-art anonymization system to nine languages by transforming language-dependen… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.02937v1-abstract-full').style.display = 'inline'; document.getElementById('2407.02937v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.02937v1-abstract-full" style="display: none;"> In speaker anonymization, speech recordings are modified in a way that the identity of the speaker remains hidden. While this technology could help to protect the privacy of individuals around the globe, current research restricts this by focusing almost exclusively on English data. In this study, we extend a state-of-the-art anonymization system to nine languages by transforming language-dependent components to their multilingual counterparts. Experiments testing the robustness of the anonymized speech against privacy attacks and speech deterioration show an overall success of this system for all languages. The results suggest that speaker embeddings trained on English data can be applied across languages, and that the anonymization performance for a language is mainly affected by the quality of the speech synthesis component used for it. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.02937v1-abstract-full').style.display = 'none'; document.getElementById('2407.02937v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">accepted at Interspeech 2024</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Proc. Interspeech 2024, pp. 4448-4452 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.06406">arXiv:2406.06406</a> <span> [<a href="https://arxiv.org/pdf/2406.06406">pdf</a>, <a href="https://arxiv.org/format/2406.06406">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Controlling Emotion in Text-to-Speech with Natural Language Prompts </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Bott%2C+T">Thomas Bott</a>, <a href="/search/cs?searchtype=author&query=Lux%2C+F">Florian Lux</a>, <a href="/search/cs?searchtype=author&query=Vu%2C+N+T">Ngoc Thang Vu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.06406v2-abstract-short" style="display: inline;"> In recent years, prompting has quickly become one of the standard ways of steering the outputs of generative machine learning models, due to its intuitive use of natural language. In this work, we propose a system conditioned on embeddings derived from an emotionally rich text that serves as prompt. Thereby, a joint representation of speaker and prompt embeddings is integrated at several points wi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.06406v2-abstract-full').style.display = 'inline'; document.getElementById('2406.06406v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.06406v2-abstract-full" style="display: none;"> In recent years, prompting has quickly become one of the standard ways of steering the outputs of generative machine learning models, due to its intuitive use of natural language. In this work, we propose a system conditioned on embeddings derived from an emotionally rich text that serves as prompt. Thereby, a joint representation of speaker and prompt embeddings is integrated at several points within a transformer-based architecture. Our approach is trained on merged emotional speech and text datasets and varies prompts in each training iteration to increase the generalization capabilities of the model. Objective and subjective evaluation results demonstrate the ability of the conditioned synthesis system to accurately transfer the emotions present in a prompt to speech. At the same time, precise tractability of speaker identities as well as overall high speech quality and intelligibility are maintained. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.06406v2-abstract-full').style.display = 'none'; document.getElementById('2406.06406v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 10 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">accepted at Interspeech 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.06403">arXiv:2406.06403</a> <span> [<a href="https://arxiv.org/pdf/2406.06403">pdf</a>, <a href="https://arxiv.org/format/2406.06403">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Meta Learning Text-to-Speech Synthesis in over 7000 Languages </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lux%2C+F">Florian Lux</a>, <a href="/search/cs?searchtype=author&query=Meyer%2C+S">Sarina Meyer</a>, <a href="/search/cs?searchtype=author&query=Behringer%2C+L">Lyonel Behringer</a>, <a href="/search/cs?searchtype=author&query=Zalkow%2C+F">Frank Zalkow</a>, <a href="/search/cs?searchtype=author&query=Do%2C+P">Phat Do</a>, <a href="/search/cs?searchtype=author&query=Coler%2C+M">Matt Coler</a>, <a href="/search/cs?searchtype=author&query=Habets%2C+E+A+P">Emanu毛l A. P. Habets</a>, <a href="/search/cs?searchtype=author&query=Vu%2C+N+T">Ngoc Thang Vu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.06403v1-abstract-short" style="display: inline;"> In this work, we take on the challenging task of building a single text-to-speech synthesis system that is capable of generating speech in over 7000 languages, many of which lack sufficient data for traditional TTS development. By leveraging a novel integration of massively multilingual pretraining and meta learning to approximate language representations, our approach enables zero-shot speech syn… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.06403v1-abstract-full').style.display = 'inline'; document.getElementById('2406.06403v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.06403v1-abstract-full" style="display: none;"> In this work, we take on the challenging task of building a single text-to-speech synthesis system that is capable of generating speech in over 7000 languages, many of which lack sufficient data for traditional TTS development. By leveraging a novel integration of massively multilingual pretraining and meta learning to approximate language representations, our approach enables zero-shot speech synthesis in languages without any available data. We validate our system's performance through objective measures and human evaluation across a diverse linguistic landscape. By releasing our code and models publicly, we aim to empower communities with limited linguistic resources and foster further innovation in the field of speech technology. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.06403v1-abstract-full').style.display = 'none'; document.getElementById('2406.06403v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">accepted at Interspeech 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.09335">arXiv:2405.09335</a> <span> [<a href="https://arxiv.org/pdf/2405.09335">pdf</a>, <a href="https://arxiv.org/format/2405.09335">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Prompting-based Synthetic Data Generation for Few-Shot Question Answering </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Schmidt%2C+M">Maximilian Schmidt</a>, <a href="/search/cs?searchtype=author&query=Bartezzaghi%2C+A">Andrea Bartezzaghi</a>, <a href="/search/cs?searchtype=author&query=Vu%2C+N+T">Ngoc Thang Vu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.09335v1-abstract-short" style="display: inline;"> Although language models (LMs) have boosted the performance of Question Answering, they still need plenty of data. Data annotation, in contrast, is a time-consuming process. This especially applies to Question Answering, where possibly large documents have to be parsed and annotated with questions and their corresponding answers. Furthermore, Question Answering models often only work well for the… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.09335v1-abstract-full').style.display = 'inline'; document.getElementById('2405.09335v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.09335v1-abstract-full" style="display: none;"> Although language models (LMs) have boosted the performance of Question Answering, they still need plenty of data. Data annotation, in contrast, is a time-consuming process. This especially applies to Question Answering, where possibly large documents have to be parsed and annotated with questions and their corresponding answers. Furthermore, Question Answering models often only work well for the domain they were trained on. Since annotation is costly, we argue that domain-agnostic knowledge from LMs, such as linguistic understanding, is sufficient to create a well-curated dataset. With this motivation, we show that using large language models can improve Question Answering performance on various datasets in the few-shot setting compared to state-of-the-art approaches. For this, we perform data generation leveraging the Prompting framework, suggesting that language models contain valuable task-agnostic knowledge that can be used beyond the common pre-training/fine-tuning scheme. As a result, we consistently outperform previous approaches on few-shot Question Answering. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.09335v1-abstract-full').style.display = 'none'; document.getElementById('2405.09335v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">LREC-COLING 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.10922">arXiv:2404.10922</a> <span> [<a href="https://arxiv.org/pdf/2404.10922">pdf</a>, <a href="https://arxiv.org/format/2404.10922">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Teaching a Multilingual Large Language Model to Understand Multilingual Speech via Multi-Instructional Training </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Denisov%2C+P">Pavel Denisov</a>, <a href="/search/cs?searchtype=author&query=Vu%2C+N+T">Ngoc Thang Vu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.10922v1-abstract-short" style="display: inline;"> Recent advancements in language modeling have led to the emergence of Large Language Models (LLMs) capable of various natural language processing tasks. Despite their success in text-based tasks, applying LLMs to the speech domain remains limited and challenging. This paper presents BLOOMZMMS, a novel model that integrates a multilingual LLM with a multilingual speech encoder, aiming to harness th… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.10922v1-abstract-full').style.display = 'inline'; document.getElementById('2404.10922v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.10922v1-abstract-full" style="display: none;"> Recent advancements in language modeling have led to the emergence of Large Language Models (LLMs) capable of various natural language processing tasks. Despite their success in text-based tasks, applying LLMs to the speech domain remains limited and challenging. This paper presents BLOOMZMMS, a novel model that integrates a multilingual LLM with a multilingual speech encoder, aiming to harness the capabilities of LLMs for speech recognition and beyond. Utilizing a multi-instructional training approach, we demonstrate the transferability of linguistic knowledge from the text to the speech modality. Our experiments, conducted on 1900 hours of transcribed data from 139 languages, establish that a multilingual speech representation can be effectively learned and aligned with a multilingual LLM. While this learned representation initially shows limitations in task generalization, we address this issue by generating synthetic targets in a multi-instructional style. Our zero-shot evaluation results confirm the robustness of our approach across multiple tasks, including speech translation and multilingual spoken language understanding, thereby opening new avenues for applying LLMs in the speech domain. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.10922v1-abstract-full').style.display = 'none'; document.getElementById('2404.10922v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">NAACL Findings 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.17647">arXiv:2403.17647</a> <span> [<a href="https://arxiv.org/pdf/2403.17647">pdf</a>, <a href="https://arxiv.org/format/2403.17647">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Intrinsic Subgraph Generation for Interpretable Graph based Visual Question Answering </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tilli%2C+P">Pascal Tilli</a>, <a href="/search/cs?searchtype=author&query=Vu%2C+N+T">Ngoc Thang Vu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.17647v2-abstract-short" style="display: inline;"> The large success of deep learning based methods in Visual Question Answering (VQA) has concurrently increased the demand for explainable methods. Most methods in Explainable Artificial Intelligence (XAI) focus on generating post-hoc explanations rather than taking an intrinsic approach, the latter characterizing an interpretable model. In this work, we introduce an interpretable approach for grap… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.17647v2-abstract-full').style.display = 'inline'; document.getElementById('2403.17647v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.17647v2-abstract-full" style="display: none;"> The large success of deep learning based methods in Visual Question Answering (VQA) has concurrently increased the demand for explainable methods. Most methods in Explainable Artificial Intelligence (XAI) focus on generating post-hoc explanations rather than taking an intrinsic approach, the latter characterizing an interpretable model. In this work, we introduce an interpretable approach for graph-based VQA and demonstrate competitive performance on the GQA dataset. This approach bridges the gap between interpretability and performance. Our model is designed to intrinsically produce a subgraph during the question-answering process as its explanation, providing insight into the decision making. To evaluate the quality of these generated subgraphs, we compare them against established post-hoc explainability methods for graph neural networks, and perform a human evaluation. Moreover, we present quantitative metrics that correlate with the evaluations of human assessors, acting as automatic metrics for the generated explanatory subgraphs. Our implementation is available at https://github.com/DigitalPhonetics/Intrinsic-Subgraph-Generation-for-VQA. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.17647v2-abstract-full').style.display = 'none'; document.getElementById('2403.17647v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 26 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at LREC-COLING 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.17582">arXiv:2403.17582</a> <span> [<a href="https://arxiv.org/pdf/2403.17582">pdf</a>, <a href="https://arxiv.org/format/2403.17582">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Towards a Zero-Data, Controllable, Adaptive Dialog System </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=V%C3%A4th%2C+D">Dirk V盲th</a>, <a href="/search/cs?searchtype=author&query=Vanderlyn%2C+L">Lindsey Vanderlyn</a>, <a href="/search/cs?searchtype=author&query=Vu%2C+N+T">Ngoc Thang Vu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.17582v1-abstract-short" style="display: inline;"> Conversational Tree Search (V盲th et al., 2023) is a recent approach to controllable dialog systems, where domain experts shape the behavior of a Reinforcement Learning agent through a dialog tree. The agent learns to efficiently navigate this tree, while adapting to information needs, e.g., domain familiarity, of different users. However, the need for additional training data hinders deployment in… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.17582v1-abstract-full').style.display = 'inline'; document.getElementById('2403.17582v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.17582v1-abstract-full" style="display: none;"> Conversational Tree Search (V盲th et al., 2023) is a recent approach to controllable dialog systems, where domain experts shape the behavior of a Reinforcement Learning agent through a dialog tree. The agent learns to efficiently navigate this tree, while adapting to information needs, e.g., domain familiarity, of different users. However, the need for additional training data hinders deployment in new domains. To address this, we explore approaches to generate this data directly from dialog trees. We improve the original approach, and show that agents trained on synthetic data can achieve comparable dialog success to models trained on human data, both when using a commercial Large Language Model for generation, or when using a smaller open-source model, running on a single GPU. We further demonstrate the scalability of our approach by collecting and testing on two new datasets: ONBOARD, a new domain helping foreign residents moving to a new city, and the medical domain DIAGNOSE, a subset of Wikipedia articles related to scalp and head symptoms. Finally, we perform human testing, where no statistically significant differences were found in either objective or subjective measures between models trained on human and generated data. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.17582v1-abstract-full').style.display = 'none'; document.getElementById('2403.17582v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.05338">arXiv:2403.05338</a> <span> [<a href="https://arxiv.org/pdf/2403.05338">pdf</a>, <a href="https://arxiv.org/format/2403.05338">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Explaining Pre-Trained Language Models with Attribution Scores: An Analysis in Low-Resource Settings </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhou%2C+W">Wei Zhou</a>, <a href="/search/cs?searchtype=author&query=Adel%2C+H">Heike Adel</a>, <a href="/search/cs?searchtype=author&query=Schuff%2C+H">Hendrik Schuff</a>, <a href="/search/cs?searchtype=author&query=Vu%2C+N+T">Ngoc Thang Vu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.05338v1-abstract-short" style="display: inline;"> Attribution scores indicate the importance of different input parts and can, thus, explain model behaviour. Currently, prompt-based models are gaining popularity, i.a., due to their easier adaptability in low-resource settings. However, the quality of attribution scores extracted from prompt-based models has not been investigated yet. In this work, we address this topic by analyzing attribution sc… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.05338v1-abstract-full').style.display = 'inline'; document.getElementById('2403.05338v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.05338v1-abstract-full" style="display: none;"> Attribution scores indicate the importance of different input parts and can, thus, explain model behaviour. Currently, prompt-based models are gaining popularity, i.a., due to their easier adaptability in low-resource settings. However, the quality of attribution scores extracted from prompt-based models has not been investigated yet. In this work, we address this topic by analyzing attribution scores extracted from prompt-based models w.r.t. plausibility and faithfulness and comparing them with attribution scores extracted from fine-tuned models and large language models. In contrast to previous work, we introduce training size as another dimension into the analysis. We find that using the prompting paradigm (with either encoder-based or decoder-based models) yields more plausible explanations than fine-tuning the models in low-resource settings and Shapley Value Sampling consistently outperforms attention and Integrated Gradients in terms of leading to more plausible and faithful explanations. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.05338v1-abstract-full').style.display = 'none'; document.getElementById('2403.05338v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2310.17502">arXiv:2310.17502</a> <span> [<a href="https://arxiv.org/pdf/2310.17502">pdf</a>, <a href="https://arxiv.org/format/2310.17502">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.21437/Interspeech.2023-858">10.21437/Interspeech.2023-858 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Controllable Generation of Artificial Speaker Embeddings through Discovery of Principal Directions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lux%2C+F">Florian Lux</a>, <a href="/search/cs?searchtype=author&query=Tilli%2C+P">Pascal Tilli</a>, <a href="/search/cs?searchtype=author&query=Meyer%2C+S">Sarina Meyer</a>, <a href="/search/cs?searchtype=author&query=Vu%2C+N+T">Ngoc Thang Vu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2310.17502v1-abstract-short" style="display: inline;"> Customizing voice and speaking style in a speech synthesis system with intuitive and fine-grained controls is challenging, given that little data with appropriate labels is available. Furthermore, editing an existing human's voice also comes with ethical concerns. In this paper, we propose a method to generate artificial speaker embeddings that cannot be linked to a real human while offering intui… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.17502v1-abstract-full').style.display = 'inline'; document.getElementById('2310.17502v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2310.17502v1-abstract-full" style="display: none;"> Customizing voice and speaking style in a speech synthesis system with intuitive and fine-grained controls is challenging, given that little data with appropriate labels is available. Furthermore, editing an existing human's voice also comes with ethical concerns. In this paper, we propose a method to generate artificial speaker embeddings that cannot be linked to a real human while offering intuitive and fine-grained control over the voice and speaking style of the embeddings, without requiring any labels for speaker or style. The artificial and controllable embeddings can be fed to a speech synthesis system, conditioned on embeddings of real humans during training, without sacrificing privacy during inference. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.17502v1-abstract-full').style.display = 'none'; document.getElementById('2310.17502v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Published at ISCA Interspeech 2023 https://www.isca-speech.org/archive/interspeech_2023/lux23_interspeech.html</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2310.17499">arXiv:2310.17499</a> <span> [<a href="https://arxiv.org/pdf/2310.17499">pdf</a>, <a href="https://arxiv.org/format/2310.17499">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> The IMS Toucan System for the Blizzard Challenge 2023 </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lux%2C+F">Florian Lux</a>, <a href="/search/cs?searchtype=author&query=Koch%2C+J">Julia Koch</a>, <a href="/search/cs?searchtype=author&query=Meyer%2C+S">Sarina Meyer</a>, <a href="/search/cs?searchtype=author&query=Bott%2C+T">Thomas Bott</a>, <a href="/search/cs?searchtype=author&query=Schauffler%2C+N">Nadja Schauffler</a>, <a href="/search/cs?searchtype=author&query=Denisov%2C+P">Pavel Denisov</a>, <a href="/search/cs?searchtype=author&query=Schweitzer%2C+A">Antje Schweitzer</a>, <a href="/search/cs?searchtype=author&query=Vu%2C+N+T">Ngoc Thang Vu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2310.17499v1-abstract-short" style="display: inline;"> For our contribution to the Blizzard Challenge 2023, we improved on the system we submitted to the Blizzard Challenge 2021. Our approach entails a rule-based text-to-phoneme processing system that includes rule-based disambiguation of homographs in the French language. It then transforms the phonemes to spectrograms as intermediate representations using a fast and efficient non-autoregressive synt… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.17499v1-abstract-full').style.display = 'inline'; document.getElementById('2310.17499v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2310.17499v1-abstract-full" style="display: none;"> For our contribution to the Blizzard Challenge 2023, we improved on the system we submitted to the Blizzard Challenge 2021. Our approach entails a rule-based text-to-phoneme processing system that includes rule-based disambiguation of homographs in the French language. It then transforms the phonemes to spectrograms as intermediate representations using a fast and efficient non-autoregressive synthesis architecture based on Conformer and Glow. A GAN based neural vocoder that combines recent state-of-the-art approaches converts the spectrogram to the final wave. We carefully designed the data processing, training, and inference procedures for the challenge data. Our system identifier is G. Open source code and demo are available. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.17499v1-abstract-full').style.display = 'none'; document.getElementById('2310.17499v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Published at the Blizzard Challenge Workshop 2023, colocated with the Speech Synthesis Workshop 2023, a sattelite event of the Interspeech 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2310.15262">arXiv:2310.15262</a> <span> [<a href="https://arxiv.org/pdf/2310.15262">pdf</a>, <a href="https://arxiv.org/format/2310.15262">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Data Augmentation Techniques for Machine Translation of Code-Switched Texts: A Comparative Study </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hamed%2C+I">Injy Hamed</a>, <a href="/search/cs?searchtype=author&query=Habash%2C+N">Nizar Habash</a>, <a href="/search/cs?searchtype=author&query=Vu%2C+N+T">Ngoc Thang Vu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2310.15262v1-abstract-short" style="display: inline;"> Code-switching (CSW) text generation has been receiving increasing attention as a solution to address data scarcity. In light of this growing interest, we need more comprehensive studies comparing different augmentation approaches. In this work, we compare three popular approaches: lexical replacements, linguistic theories, and back-translation (BT), in the context of Egyptian Arabic-English CSW.… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.15262v1-abstract-full').style.display = 'inline'; document.getElementById('2310.15262v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2310.15262v1-abstract-full" style="display: none;"> Code-switching (CSW) text generation has been receiving increasing attention as a solution to address data scarcity. In light of this growing interest, we need more comprehensive studies comparing different augmentation approaches. In this work, we compare three popular approaches: lexical replacements, linguistic theories, and back-translation (BT), in the context of Egyptian Arabic-English CSW. We assess the effectiveness of the approaches on machine translation and the quality of augmentations through human evaluation. We show that BT and CSW predictive-based lexical replacement, being trained on CSW parallel data, perform best on both tasks. Linguistic theories and random lexical replacement prove to be effective in the lack of CSW parallel data, where both approaches achieve similar results. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.15262v1-abstract-full').style.display = 'none'; document.getElementById('2310.15262v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Findings of EMNLP 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2310.06103">arXiv:2310.06103</a> <span> [<a href="https://arxiv.org/pdf/2310.06103">pdf</a>, <a href="https://arxiv.org/format/2310.06103">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Leveraging Multilingual Self-Supervised Pretrained Models for Sequence-to-Sequence End-to-End Spoken Language Understanding </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Denisov%2C+P">Pavel Denisov</a>, <a href="/search/cs?searchtype=author&query=Vu%2C+N+T">Ngoc Thang Vu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2310.06103v1-abstract-short" style="display: inline;"> A number of methods have been proposed for End-to-End Spoken Language Understanding (E2E-SLU) using pretrained models, however their evaluation often lacks multilingual setup and tasks that require prediction of lexical fillers, such as slot filling. In this work, we propose a unified method that integrates multilingual pretrained speech and text models and performs E2E-SLU on six datasets in four… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.06103v1-abstract-full').style.display = 'inline'; document.getElementById('2310.06103v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2310.06103v1-abstract-full" style="display: none;"> A number of methods have been proposed for End-to-End Spoken Language Understanding (E2E-SLU) using pretrained models, however their evaluation often lacks multilingual setup and tasks that require prediction of lexical fillers, such as slot filling. In this work, we propose a unified method that integrates multilingual pretrained speech and text models and performs E2E-SLU on six datasets in four languages in a generative manner, including the prediction of lexical fillers. We investigate how the proposed method can be improved by pretraining on widely available speech recognition data using several training objectives. Pretraining on 7000 hours of multilingual data allows us to outperform the state-of-the-art ultimately on two SLU datasets and partly on two more SLU datasets. Finally, we examine the cross-lingual capabilities of the proposed model and improve on the best known result on the PortMEDIA-Language dataset by almost half, achieving a Concept/Value Error Rate of 23.65%. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.06103v1-abstract-full').style.display = 'none'; document.getElementById('2310.06103v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">IEEE Workshop on Automatic Speech Recognition and Understanding (ASRU) 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2309.08049">arXiv:2309.08049</a> <span> [<a href="https://arxiv.org/pdf/2309.08049">pdf</a>, <a href="https://arxiv.org/format/2309.08049">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/OJSP.2023.3344375">10.1109/OJSP.2023.3344375 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> VoicePAT: An Efficient Open-source Evaluation Toolkit for Voice Privacy Research </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Meyer%2C+S">Sarina Meyer</a>, <a href="/search/cs?searchtype=author&query=Miao%2C+X">Xiaoxiao Miao</a>, <a href="/search/cs?searchtype=author&query=Vu%2C+N+T">Ngoc Thang Vu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2309.08049v2-abstract-short" style="display: inline;"> Speaker anonymization is the task of modifying a speech recording such that the original speaker cannot be identified anymore. Since the first Voice Privacy Challenge in 2020, along with the release of a framework, the popularity of this research topic is continually increasing. However, the comparison and combination of different anonymization approaches remains challenging due to the complexity… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.08049v2-abstract-full').style.display = 'inline'; document.getElementById('2309.08049v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2309.08049v2-abstract-full" style="display: none;"> Speaker anonymization is the task of modifying a speech recording such that the original speaker cannot be identified anymore. Since the first Voice Privacy Challenge in 2020, along with the release of a framework, the popularity of this research topic is continually increasing. However, the comparison and combination of different anonymization approaches remains challenging due to the complexity of evaluation and the absence of user-friendly research frameworks. We therefore propose an efficient speaker anonymization and evaluation framework based on a modular and easily extendable structure, almost fully in Python. The framework facilitates the orchestration of several anonymization approaches in parallel and allows for interfacing between different techniques. Furthermore, we propose modifications to common evaluation methods which improves the quality of the evaluation and reduces their computation time by 65 to 95%, depending on the metric. Our code is fully open source. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.08049v2-abstract-full').style.display = 'none'; document.getElementById('2309.08049v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 14 September, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by OJSP-ICASSP 2024 https://ieeexplore.ieee.org/document/10365329</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2308.06420">arXiv:2308.06420</a> <span> [<a href="https://arxiv.org/pdf/2308.06420">pdf</a>, <a href="https://arxiv.org/format/2308.06420">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> M&M: Tackling False Positives in Mammography with a Multi-view and Multi-instance Learning Sparse Detector </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Vu%2C+Y+N+T">Yen Nhi Truong Vu</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+D">Dan Guo</a>, <a href="/search/cs?searchtype=author&query=Taha%2C+A">Ahmed Taha</a>, <a href="/search/cs?searchtype=author&query=Su%2C+J">Jason Su</a>, <a href="/search/cs?searchtype=author&query=Matthews%2C+T+P">Thomas Paul Matthews</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2308.06420v1-abstract-short" style="display: inline;"> Deep-learning-based object detection methods show promise for improving screening mammography, but high rates of false positives can hinder their effectiveness in clinical practice. To reduce false positives, we identify three challenges: (1) unlike natural images, a malignant mammogram typically contains only one malignant finding; (2) mammography exams contain two views of each breast, and both… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.06420v1-abstract-full').style.display = 'inline'; document.getElementById('2308.06420v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2308.06420v1-abstract-full" style="display: none;"> Deep-learning-based object detection methods show promise for improving screening mammography, but high rates of false positives can hinder their effectiveness in clinical practice. To reduce false positives, we identify three challenges: (1) unlike natural images, a malignant mammogram typically contains only one malignant finding; (2) mammography exams contain two views of each breast, and both views ought to be considered to make a correct assessment; (3) most mammograms are negative and do not contain any findings. In this work, we tackle the three aforementioned challenges by: (1) leveraging Sparse R-CNN and showing that sparse detectors are more appropriate than dense detectors for mammography; (2) including a multi-view cross-attention module to synthesize information from different views; (3) incorporating multi-instance learning (MIL) to train with unannotated images and perform breast-level classification. The resulting model, M&M, is a Multi-view and Multi-instance learning system that can both localize malignant findings and provide breast-level predictions. We validate M&M's detection and classification performance using five mammography datasets. In addition, we demonstrate the effectiveness of each proposed component through comprehensive ablation studies. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.06420v1-abstract-full').style.display = 'none'; document.getElementById('2308.06420v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 August, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">MICCAI 2023 with supplementary materials</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2306.06804">arXiv:2306.06804</a> <span> [<a href="https://arxiv.org/pdf/2306.06804">pdf</a>, <a href="https://arxiv.org/format/2306.06804">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Neural Machine Translation for the Indigenous Languages of the Americas: An Introduction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Mager%2C+M">Manuel Mager</a>, <a href="/search/cs?searchtype=author&query=Bhatnagar%2C+R">Rajat Bhatnagar</a>, <a href="/search/cs?searchtype=author&query=Neubig%2C+G">Graham Neubig</a>, <a href="/search/cs?searchtype=author&query=Vu%2C+N+T">Ngoc Thang Vu</a>, <a href="/search/cs?searchtype=author&query=Kann%2C+K">Katharina Kann</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2306.06804v1-abstract-short" style="display: inline;"> Neural models have drastically advanced state of the art for machine translation (MT) between high-resource languages. Traditionally, these models rely on large amounts of training data, but many language pairs lack these resources. However, an important part of the languages in the world do not have this amount of data. Most languages from the Americas are among them, having a limited amount of p… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.06804v1-abstract-full').style.display = 'inline'; document.getElementById('2306.06804v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2306.06804v1-abstract-full" style="display: none;"> Neural models have drastically advanced state of the art for machine translation (MT) between high-resource languages. Traditionally, these models rely on large amounts of training data, but many language pairs lack these resources. However, an important part of the languages in the world do not have this amount of data. Most languages from the Americas are among them, having a limited amount of parallel and monolingual data, if any. Here, we present an introduction to the interested reader to the basic challenges, concepts, and techniques that involve the creation of MT systems for these languages. Finally, we discuss the recent advances and findings and open questions, product of an increased interest of the NLP community in these languages. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.06804v1-abstract-full').style.display = 'none'; document.getElementById('2306.06804v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to AmericasNLP 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2305.19474">arXiv:2305.19474</a> <span> [<a href="https://arxiv.org/pdf/2305.19474">pdf</a>, <a href="https://arxiv.org/format/2305.19474">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Ethical Considerations for Machine Translation of Indigenous Languages: Giving a Voice to the Speakers </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Mager%2C+M">Manuel Mager</a>, <a href="/search/cs?searchtype=author&query=Mager%2C+E">Elisabeth Mager</a>, <a href="/search/cs?searchtype=author&query=Kann%2C+K">Katharina Kann</a>, <a href="/search/cs?searchtype=author&query=Vu%2C+N+T">Ngoc Thang Vu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2305.19474v1-abstract-short" style="display: inline;"> In recent years machine translation has become very successful for high-resource language pairs. This has also sparked new interest in research on the automatic translation of low-resource languages, including Indigenous languages. However, the latter are deeply related to the ethnic and cultural groups that speak (or used to speak) them. The data collection, modeling and deploying machine transla… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.19474v1-abstract-full').style.display = 'inline'; document.getElementById('2305.19474v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2305.19474v1-abstract-full" style="display: none;"> In recent years machine translation has become very successful for high-resource language pairs. This has also sparked new interest in research on the automatic translation of low-resource languages, including Indigenous languages. However, the latter are deeply related to the ethnic and cultural groups that speak (or used to speak) them. The data collection, modeling and deploying machine translation systems thus result in new ethical questions that must be addressed. Motivated by this, we first survey the existing literature on ethical considerations for the documentation, translation, and general natural language processing for Indigenous languages. Afterward, we conduct and analyze an interview study to shed light on the positions of community leaders, teachers, and language activists regarding ethical concerns for the automatic translation of their languages. Our results show that the inclusion, at different degrees, of native speakers and community members is vital to performing better and more ethical research on Indigenous languages. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.19474v1-abstract-full').style.display = 'none'; document.getElementById('2305.19474v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to ACL2023 Main Conference</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2305.02679">arXiv:2305.02679</a> <span> [<a href="https://arxiv.org/pdf/2305.02679">pdf</a>, <a href="https://arxiv.org/format/2305.02679">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> Neighboring Words Affect Human Interpretation of Saliency Explanations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jacovi%2C+A">Alon Jacovi</a>, <a href="/search/cs?searchtype=author&query=Schuff%2C+H">Hendrik Schuff</a>, <a href="/search/cs?searchtype=author&query=Adel%2C+H">Heike Adel</a>, <a href="/search/cs?searchtype=author&query=Vu%2C+N+T">Ngoc Thang Vu</a>, <a href="/search/cs?searchtype=author&query=Goldberg%2C+Y">Yoav Goldberg</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2305.02679v2-abstract-short" style="display: inline;"> Word-level saliency explanations ("heat maps over words") are often used to communicate feature-attribution in text-based models. Recent studies found that superficial factors such as word length can distort human interpretation of the communicated saliency scores. We conduct a user study to investigate how the marking of a word's neighboring words affect the explainee's perception of the word's i… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.02679v2-abstract-full').style.display = 'inline'; document.getElementById('2305.02679v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2305.02679v2-abstract-full" style="display: none;"> Word-level saliency explanations ("heat maps over words") are often used to communicate feature-attribution in text-based models. Recent studies found that superficial factors such as word length can distort human interpretation of the communicated saliency scores. We conduct a user study to investigate how the marking of a word's neighboring words affect the explainee's perception of the word's importance in the context of a saliency explanation. We find that neighboring words have significant effects on the word's importance rating. Concretely, we identify that the influence changes based on neighboring direction (left vs. right) and a-priori linguistic and computational measures of phrases and collocations (vs. unrelated neighboring words). Our results question whether text-based saliency explanations should be continued to be communicated at word level, and inform future research on alternative saliency explanation methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.02679v2-abstract-full').style.display = 'none'; document.getElementById('2305.02679v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to Findings of ACL 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2304.04478">arXiv:2304.04478</a> <span> [<a href="https://arxiv.org/pdf/2304.04478">pdf</a>, <a href="https://arxiv.org/format/2304.04478">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Oh, Jeez! or Uh-huh? A Listener-aware Backchannel Predictor on ASR Transcriptions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ortega%2C+D">Daniel Ortega</a>, <a href="/search/cs?searchtype=author&query=Li%2C+C">Chia-Yu Li</a>, <a href="/search/cs?searchtype=author&query=Vu%2C+N+T">Ngoc Thang Vu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2304.04478v1-abstract-short" style="display: inline;"> This paper presents our latest investigation on modeling backchannel in conversations. Motivated by a proactive backchanneling theory, we aim at developing a system which acts as a proactive listener by inserting backchannels, such as continuers and assessment, to influence speakers. Our model takes into account not only lexical and acoustic cues, but also introduces the simple and novel idea of u… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2304.04478v1-abstract-full').style.display = 'inline'; document.getElementById('2304.04478v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2304.04478v1-abstract-full" style="display: none;"> This paper presents our latest investigation on modeling backchannel in conversations. Motivated by a proactive backchanneling theory, we aim at developing a system which acts as a proactive listener by inserting backchannels, such as continuers and assessment, to influence speakers. Our model takes into account not only lexical and acoustic cues, but also introduces the simple and novel idea of using listener embeddings to mimic different backchanneling behaviours. Our experimental results on the Switchboard benchmark dataset reveal that acoustic cues are more important than lexical cues in this task and their combination with listener embeddings works best on both, manual transcriptions and automatically generated transcriptions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2304.04478v1-abstract-full').style.display = 'none'; document.getElementById('2304.04478v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 April, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Published in ICASSP 2020</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2304.04472">arXiv:2304.04472</a> <span> [<a href="https://arxiv.org/pdf/2304.04472">pdf</a>, <a href="https://arxiv.org/format/2304.04472">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Modeling Speaker-Listener Interaction for Backchannel Prediction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ortega%2C+D">Daniel Ortega</a>, <a href="/search/cs?searchtype=author&query=Meyer%2C+S">Sarina Meyer</a>, <a href="/search/cs?searchtype=author&query=Schweitzer%2C+A">Antje Schweitzer</a>, <a href="/search/cs?searchtype=author&query=Vu%2C+N+T">Ngoc Thang Vu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2304.04472v1-abstract-short" style="display: inline;"> We present our latest findings on backchannel modeling novelly motivated by the canonical use of the minimal responses Yeah and Uh-huh in English and their correspondent tokens in German, and the effect of encoding the speaker-listener interaction. Backchanneling theories emphasize the active and continuous role of the listener in the course of the conversation, their effects on the speaker's subs… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2304.04472v1-abstract-full').style.display = 'inline'; document.getElementById('2304.04472v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2304.04472v1-abstract-full" style="display: none;"> We present our latest findings on backchannel modeling novelly motivated by the canonical use of the minimal responses Yeah and Uh-huh in English and their correspondent tokens in German, and the effect of encoding the speaker-listener interaction. Backchanneling theories emphasize the active and continuous role of the listener in the course of the conversation, their effects on the speaker's subsequent talk, and the consequent dynamic speaker-listener interaction. Therefore, we propose a neural-based acoustic backchannel classifier on minimal responses by processing acoustic features from the speaker speech, capturing and imitating listeners' backchanneling behavior, and encoding speaker-listener interaction. Our experimental results on the Switchboard and GECO datasets reveal that in almost all tested scenarios the speaker or listener behavior embeddings help the model make more accurate backchannel predictions. More importantly, a proper interaction encoding strategy, i.e., combining the speaker and listener embeddings, leads to the best performance on both datasets in terms of F1-score. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2304.04472v1-abstract-full').style.display = 'none'; document.getElementById('2304.04472v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 April, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Published in IWSDS 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2303.16417">arXiv:2303.16417</a> <span> [<a href="https://arxiv.org/pdf/2303.16417">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Quantitative Methods">q-bio.QM</span> </div> </div> <p class="title is-5 mathjax"> Problems and shortcuts in deep learning for screening mammography </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tsue%2C+T">Trevor Tsue</a>, <a href="/search/cs?searchtype=author&query=Mombourquette%2C+B">Brent Mombourquette</a>, <a href="/search/cs?searchtype=author&query=Taha%2C+A">Ahmed Taha</a>, <a href="/search/cs?searchtype=author&query=Matthews%2C+T+P">Thomas Paul Matthews</a>, <a href="/search/cs?searchtype=author&query=Vu%2C+Y+N+T">Yen Nhi Truong Vu</a>, <a href="/search/cs?searchtype=author&query=Su%2C+J">Jason Su</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2303.16417v1-abstract-short" style="display: inline;"> This work reveals undiscovered challenges in the performance and generalizability of deep learning models. We (1) identify spurious shortcuts and evaluation issues that can inflate performance and (2) propose training and analysis methods to address them. We trained an AI model to classify cancer on a retrospective dataset of 120,112 US exams (3,467 cancers) acquired from 2008 to 2017 and 16,693… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.16417v1-abstract-full').style.display = 'inline'; document.getElementById('2303.16417v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2303.16417v1-abstract-full" style="display: none;"> This work reveals undiscovered challenges in the performance and generalizability of deep learning models. We (1) identify spurious shortcuts and evaluation issues that can inflate performance and (2) propose training and analysis methods to address them. We trained an AI model to classify cancer on a retrospective dataset of 120,112 US exams (3,467 cancers) acquired from 2008 to 2017 and 16,693 UK exams (5,655 cancers) acquired from 2011 to 2015. We evaluated on a screening mammography test set of 11,593 US exams (102 cancers; 7,594 women; age 57.1 \pm 11.0) and 1,880 UK exams (590 cancers; 1,745 women; age 63.3 \pm 7.2). A model trained on images of only view markers (no breast) achieved a 0.691 AUC. The original model trained on both datasets achieved a 0.945 AUC on the combined US+UK dataset but paradoxically only 0.838 and 0.892 on the US and UK datasets, respectively. Sampling cancers equally from both datasets during training mitigated this shortcut. A similar AUC paradox (0.903) occurred when evaluating diagnostic exams vs screening exams (0.862 vs 0.861, respectively). Removing diagnostic exams during training alleviated this bias. Finally, the model did not exhibit the AUC paradox over scanner models but still exhibited a bias toward Selenia Dimension (SD) over Hologic Selenia (HS) exams. Analysis showed that this AUC paradox occurred when a dataset attribute had values with a higher cancer prevalence (dataset bias) and the model consequently assigned a higher probability to these attribute values (model bias). Stratification and balancing cancer prevalence can mitigate shortcuts during evaluation. Dataset and model bias can introduce shortcuts and the AUC paradox, potentially pervasive issues within the healthcare AI space. Our methods can verify and mitigate shortcuts while providing a clear understanding of performance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.16417v1-abstract-full').style.display = 'none'; document.getElementById('2303.16417v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 March, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2303.10227">arXiv:2303.10227</a> <span> [<a href="https://arxiv.org/pdf/2303.10227">pdf</a>, <a href="https://arxiv.org/format/2303.10227">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Conversational Tree Search: A New Hybrid Dialog Task </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=V%C3%A4th%2C+D">Dirk V盲th</a>, <a href="/search/cs?searchtype=author&query=Vanderlyn%2C+L">Lindsey Vanderlyn</a>, <a href="/search/cs?searchtype=author&query=Vu%2C+N+T">Ngoc Thang Vu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2303.10227v1-abstract-short" style="display: inline;"> Conversational interfaces provide a flexible and easy way for users to seek information that may otherwise be difficult or inconvenient to obtain. However, existing interfaces generally fall into one of two categories: FAQs, where users must have a concrete question in order to retrieve a general answer, or dialogs, where users must follow a predefined path but may receive a personalized answer. I… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.10227v1-abstract-full').style.display = 'inline'; document.getElementById('2303.10227v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2303.10227v1-abstract-full" style="display: none;"> Conversational interfaces provide a flexible and easy way for users to seek information that may otherwise be difficult or inconvenient to obtain. However, existing interfaces generally fall into one of two categories: FAQs, where users must have a concrete question in order to retrieve a general answer, or dialogs, where users must follow a predefined path but may receive a personalized answer. In this paper, we introduce Conversational Tree Search (CTS) as a new task that bridges the gap between FAQ-style information retrieval and task-oriented dialog, allowing domain-experts to define dialog trees which can then be converted to an efficient dialog policy that learns only to ask the questions necessary to navigate a user to their goal. We collect a dataset for the travel reimbursement domain and demonstrate a baseline as well as a novel deep Reinforcement Learning architecture for this task. Our results show that the new architecture combines the positive aspects of both the FAQ and dialog system used in the baseline and achieves higher goal completion while skipping unnecessary questions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.10227v1-abstract-full').style.display = 'none'; document.getElementById('2303.10227v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 March, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">EACL 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2211.14880">arXiv:2211.14880</a> <span> [<a href="https://arxiv.org/pdf/2211.14880">pdf</a>, <a href="https://arxiv.org/format/2211.14880">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Combining Data Generation and Active Learning for Low-Resource Question Answering </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kimmich%2C+M">Maximilian Kimmich</a>, <a href="/search/cs?searchtype=author&query=Bartezzaghi%2C+A">Andrea Bartezzaghi</a>, <a href="/search/cs?searchtype=author&query=Bogojeska%2C+J">Jasmina Bogojeska</a>, <a href="/search/cs?searchtype=author&query=Malossi%2C+C">Cristiano Malossi</a>, <a href="/search/cs?searchtype=author&query=Vu%2C+N+T">Ngoc Thang Vu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2211.14880v2-abstract-short" style="display: inline;"> Neural approaches have become very popular in Question Answering (QA), however, they require a large amount of annotated data. In this work, we propose a novel approach that combines data augmentation via question-answer generation with Active Learning to improve performance in low-resource settings, where the target domains are diverse in terms of difficulty and similarity to the source domain. W… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.14880v2-abstract-full').style.display = 'inline'; document.getElementById('2211.14880v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2211.14880v2-abstract-full" style="display: none;"> Neural approaches have become very popular in Question Answering (QA), however, they require a large amount of annotated data. In this work, we propose a novel approach that combines data augmentation via question-answer generation with Active Learning to improve performance in low-resource settings, where the target domains are diverse in terms of difficulty and similarity to the source domain. We also investigate Active Learning for question answering in different stages, overall reducing the annotation effort of humans. For this purpose, we consider target domains in realistic settings, with an extremely low amount of annotated samples but with many unlabeled documents, which we assume can be obtained with little effort. Additionally, we assume a sufficient amount of labeled data from the source domain being available. We perform extensive experiments to find the best setup for incorporating domain experts. Our findings show that our novel approach, where humans are incorporated in a data generation approach, boosts performance in the low-resource, domain-specific setting, allowing for low-labeling-effort question answering systems in new, specialized domains. They further demonstrate how human annotation affects the performance of QA depending on the stage it is performed. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.14880v2-abstract-full').style.display = 'none'; document.getElementById('2211.14880v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 27 November, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICANN 2024</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.7 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2211.12000">arXiv:2211.12000</a> <span> [<a href="https://arxiv.org/pdf/2211.12000">pdf</a>, <a href="https://arxiv.org/format/2211.12000">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> ArzEn-ST: A Three-way Speech Translation Corpus for Code-Switched Egyptian Arabic - English </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hamed%2C+I">Injy Hamed</a>, <a href="/search/cs?searchtype=author&query=Habash%2C+N">Nizar Habash</a>, <a href="/search/cs?searchtype=author&query=Abdennadher%2C+S">Slim Abdennadher</a>, <a href="/search/cs?searchtype=author&query=Vu%2C+N+T">Ngoc Thang Vu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2211.12000v1-abstract-short" style="display: inline;"> We present our work on collecting ArzEn-ST, a code-switched Egyptian Arabic - English Speech Translation Corpus. This corpus is an extension of the ArzEn speech corpus, which was collected through informal interviews with bilingual speakers. In this work, we collect translations in both directions, monolingual Egyptian Arabic and monolingual English, forming a three-way speech translation corpus.… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.12000v1-abstract-full').style.display = 'inline'; document.getElementById('2211.12000v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2211.12000v1-abstract-full" style="display: none;"> We present our work on collecting ArzEn-ST, a code-switched Egyptian Arabic - English Speech Translation Corpus. This corpus is an extension of the ArzEn speech corpus, which was collected through informal interviews with bilingual speakers. In this work, we collect translations in both directions, monolingual Egyptian Arabic and monolingual English, forming a three-way speech translation corpus. We make the translation guidelines and corpus publicly available. We also report results for baseline systems for machine translation and speech translation tasks. We believe this is a valuable resource that can motivate and facilitate further research studying the code-switching phenomenon from a linguistic perspective and can be used to train and evaluate NLP systems. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.12000v1-abstract-full').style.display = 'none'; document.getElementById('2211.12000v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 November, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to the Seventh Arabic Natural Language Processing Workshop (WANLP 2022)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2211.01964">arXiv:2211.01964</a> <span> [<a href="https://arxiv.org/pdf/2211.01964">pdf</a>, <a href="https://arxiv.org/format/2211.01964">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Combining Contrastive and Non-Contrastive Losses for Fine-Tuning Pretrained Models in Speech Analysis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lux%2C+F">Florian Lux</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+C">Ching-Yi Chen</a>, <a href="/search/cs?searchtype=author&query=Vu%2C+N+T">Ngoc Thang Vu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2211.01964v1-abstract-short" style="display: inline;"> Embedding paralinguistic properties is a challenging task as there are only a few hours of training data available for domains such as emotional speech. One solution to this problem is to pretrain a general self-supervised speech representation model on large amounts of unlabeled speech. This pretrained model is then finetuned to a specific task. Paralinguistic properties however have notoriously… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.01964v1-abstract-full').style.display = 'inline'; document.getElementById('2211.01964v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2211.01964v1-abstract-full" style="display: none;"> Embedding paralinguistic properties is a challenging task as there are only a few hours of training data available for domains such as emotional speech. One solution to this problem is to pretrain a general self-supervised speech representation model on large amounts of unlabeled speech. This pretrained model is then finetuned to a specific task. Paralinguistic properties however have notoriously high class variance, making the finetuning ineffective. In this work, we propose a two step approach to this. First we improve the embedding space, then we train an adapter to bridge the gap from the embedding space to a classification task. In order to improve the class invariance we use a combination of contrastive and non-contrastive losses to explicitly optimize for class invariant, yet discriminative features. Our approach consistently outperforms baselines that are finetuned end-to-end on multiple tasks and surpasses a benchmark on state-of-the-art emotion classification. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.01964v1-abstract-full').style.display = 'none'; document.getElementById('2211.01964v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 October, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to IEEE SLT 2022</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2210.12223">arXiv:2210.12223</a> <span> [<a href="https://arxiv.org/pdf/2210.12223">pdf</a>, <a href="https://arxiv.org/format/2210.12223">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Low-Resource Multilingual and Zero-Shot Multispeaker TTS </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lux%2C+F">Florian Lux</a>, <a href="/search/cs?searchtype=author&query=Koch%2C+J">Julia Koch</a>, <a href="/search/cs?searchtype=author&query=Vu%2C+N+T">Ngoc Thang Vu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2210.12223v1-abstract-short" style="display: inline;"> While neural methods for text-to-speech (TTS) have shown great advances in modeling multiple speakers, even in zero-shot settings, the amount of data needed for those approaches is generally not feasible for the vast majority of the world's over 6,000 spoken languages. In this work, we bring together the tasks of zero-shot voice cloning and multilingual low-resource TTS. Using the language agnosti… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2210.12223v1-abstract-full').style.display = 'inline'; document.getElementById('2210.12223v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2210.12223v1-abstract-full" style="display: none;"> While neural methods for text-to-speech (TTS) have shown great advances in modeling multiple speakers, even in zero-shot settings, the amount of data needed for those approaches is generally not feasible for the vast majority of the world's over 6,000 spoken languages. In this work, we bring together the tasks of zero-shot voice cloning and multilingual low-resource TTS. Using the language agnostic meta learning (LAML) procedure and modifications to a TTS encoder, we show that it is possible for a system to learn speaking a new language using just 5 minutes of training data while retaining the ability to infer the voice of even unseen speakers in the newly learned language. We show the success of our proposed approach in terms of intelligibility, naturalness and similarity to target speaker using objective metrics as well as human studies and provide our code and trained models open source. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2210.12223v1-abstract-full').style.display = 'none'; document.getElementById('2210.12223v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 October, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to AACL 2022</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2210.11642">arXiv:2210.11642</a> <span> [<a href="https://arxiv.org/pdf/2210.11642">pdf</a>, <a href="https://arxiv.org/format/2210.11642">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Improving Semi-supervised End-to-end Automatic Speech Recognition using CycleGAN and Inter-domain Losses </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+C">Chia-Yu Li</a>, <a href="/search/cs?searchtype=author&query=Vu%2C+N+T">Ngoc Thang Vu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2210.11642v1-abstract-short" style="display: inline;"> We propose a novel method that combines CycleGAN and inter-domain losses for semi-supervised end-to-end automatic speech recognition. Inter-domain loss targets the extraction of an intermediate shared representation of speech and text inputs using a shared network. CycleGAN uses cycle-consistent loss and the identity mapping loss to preserve relevant characteristics of the input feature after conv… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2210.11642v1-abstract-full').style.display = 'inline'; document.getElementById('2210.11642v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2210.11642v1-abstract-full" style="display: none;"> We propose a novel method that combines CycleGAN and inter-domain losses for semi-supervised end-to-end automatic speech recognition. Inter-domain loss targets the extraction of an intermediate shared representation of speech and text inputs using a shared network. CycleGAN uses cycle-consistent loss and the identity mapping loss to preserve relevant characteristics of the input feature after converting from one domain to another. As such, both approaches are suitable to train end-to-end models on unpaired speech-text inputs. In this paper, we exploit the advantages from both inter-domain loss and CycleGAN to achieve better shared representation of unpaired speech and text inputs and thus improve the speech-to-text mapping. Our experimental results on the WSJ eval92 and Voxforge (non English) show 8~8.5% character error rate reduction over the baseline, and the results on LibriSpeech test_clean also show noticeable improvement. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2210.11642v1-abstract-full').style.display = 'none'; document.getElementById('2210.11642v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 October, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">6 pages + 2 references, 6 figures, accepted by SLT2022</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2210.07126">arXiv:2210.07126</a> <span> [<a href="https://arxiv.org/pdf/2210.07126">pdf</a>, <a href="https://arxiv.org/format/2210.07126">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> Challenges in Explanation Quality Evaluation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Schuff%2C+H">Hendrik Schuff</a>, <a href="/search/cs?searchtype=author&query=Adel%2C+H">Heike Adel</a>, <a href="/search/cs?searchtype=author&query=Qi%2C+P">Peng Qi</a>, <a href="/search/cs?searchtype=author&query=Vu%2C+N+T">Ngoc Thang Vu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2210.07126v2-abstract-short" style="display: inline;"> While much research focused on producing explanations, it is still unclear how the produced explanations' quality can be evaluated in a meaningful way. Today's predominant approach is to quantify explanations using proxy scores which compare explanations to (human-annotated) gold explanations. This approach assumes that explanations which reach higher proxy scores will also provide a greater benef… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2210.07126v2-abstract-full').style.display = 'inline'; document.getElementById('2210.07126v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2210.07126v2-abstract-full" style="display: none;"> While much research focused on producing explanations, it is still unclear how the produced explanations' quality can be evaluated in a meaningful way. Today's predominant approach is to quantify explanations using proxy scores which compare explanations to (human-annotated) gold explanations. This approach assumes that explanations which reach higher proxy scores will also provide a greater benefit to human users. In this paper, we present problems of this approach. Concretely, we (i) formulate desired characteristics of explanation quality, (ii) describe how current evaluation practices violate them, and (iii) support our argumentation with initial evidence from a crowdsourcing case study in which we investigate the explanation quality of state-of-the-art explainable question answering systems. We find that proxy scores correlate poorly with human quality ratings and, additionally, become less expressive the more often they are used (i.e. following Goodhart's law). Finally, we propose guidelines to enable a meaningful evaluation of explanations to drive the development of systems that provide tangible benefits to human users. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2210.07126v2-abstract-full').style.display = 'none'; document.getElementById('2210.07126v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 March, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 13 October, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">41 pages, 11 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2210.07002">arXiv:2210.07002</a> <span> [<a href="https://arxiv.org/pdf/2210.07002">pdf</a>, <a href="https://arxiv.org/format/2210.07002">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Anonymizing Speech with Generative Adversarial Networks to Preserve Speaker Privacy </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Meyer%2C+S">Sarina Meyer</a>, <a href="/search/cs?searchtype=author&query=Tilli%2C+P">Pascal Tilli</a>, <a href="/search/cs?searchtype=author&query=Denisov%2C+P">Pavel Denisov</a>, <a href="/search/cs?searchtype=author&query=Lux%2C+F">Florian Lux</a>, <a href="/search/cs?searchtype=author&query=Koch%2C+J">Julia Koch</a>, <a href="/search/cs?searchtype=author&query=Vu%2C+N+T">Ngoc Thang Vu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2210.07002v3-abstract-short" style="display: inline;"> In order to protect the privacy of speech data, speaker anonymization aims for hiding the identity of a speaker by changing the voice in speech recordings. This typically comes with a privacy-utility trade-off between protection of individuals and usability of the data for downstream applications. One of the challenges in this context is to create non-existent voices that sound as natural as possi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2210.07002v3-abstract-full').style.display = 'inline'; document.getElementById('2210.07002v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2210.07002v3-abstract-full" style="display: none;"> In order to protect the privacy of speech data, speaker anonymization aims for hiding the identity of a speaker by changing the voice in speech recordings. This typically comes with a privacy-utility trade-off between protection of individuals and usability of the data for downstream applications. One of the challenges in this context is to create non-existent voices that sound as natural as possible. In this work, we propose to tackle this issue by generating speaker embeddings using a generative adversarial network with Wasserstein distance as cost function. By incorporating these artificial embeddings into a speech-to-text-to-speech pipeline, we outperform previous approaches in terms of privacy and utility. According to standard objective metrics and human evaluation, our approach generates intelligible and content-preserving yet privacy-protecting versions of the original recordings. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2210.07002v3-abstract-full').style.display = 'none'; document.getElementById('2210.07002v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 October, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 13 October, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">IEEE Spoken Language Technology Workshop 2022</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2210.06990">arXiv:2210.06990</a> <span> [<a href="https://arxiv.org/pdf/2210.06990">pdf</a>, <a href="https://arxiv.org/format/2210.06990">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Exploring Segmentation Approaches for Neural Machine Translation of Code-Switched Egyptian Arabic-English Text </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gaser%2C+M">Marwa Gaser</a>, <a href="/search/cs?searchtype=author&query=Mager%2C+M">Manuel Mager</a>, <a href="/search/cs?searchtype=author&query=Hamed%2C+I">Injy Hamed</a>, <a href="/search/cs?searchtype=author&query=Habash%2C+N">Nizar Habash</a>, <a href="/search/cs?searchtype=author&query=Abdennadher%2C+S">Slim Abdennadher</a>, <a href="/search/cs?searchtype=author&query=Vu%2C+N+T">Ngoc Thang Vu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2210.06990v3-abstract-short" style="display: inline;"> Data sparsity is one of the main challenges posed by code-switching (CS), which is further exacerbated in the case of morphologically rich languages. For the task of machine translation (MT), morphological segmentation has proven successful in alleviating data sparsity in monolingual contexts; however, it has not been investigated for CS settings. In this paper, we study the effectiveness of diffe… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2210.06990v3-abstract-full').style.display = 'inline'; document.getElementById('2210.06990v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2210.06990v3-abstract-full" style="display: none;"> Data sparsity is one of the main challenges posed by code-switching (CS), which is further exacerbated in the case of morphologically rich languages. For the task of machine translation (MT), morphological segmentation has proven successful in alleviating data sparsity in monolingual contexts; however, it has not been investigated for CS settings. In this paper, we study the effectiveness of different segmentation approaches on MT performance, covering morphology-based and frequency-based segmentation techniques. We experiment on MT from code-switched Arabic-English to English. We provide detailed analysis, examining a variety of conditions, such as data size and sentences with different degrees of CS. Empirical results show that morphology-aware segmenters perform the best in segmentation tasks but under-perform in MT. Nevertheless, we find that the choice of the segmentation setup to use for MT is highly dependent on the data size. For extreme low-resource scenarios, a combination of frequency and morphology-based segmentations is shown to perform the best. For more resourced settings, such a combination does not bring significant improvements over the use of frequency-based segmentation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2210.06990v3-abstract-full').style.display = 'none'; document.getElementById('2210.06990v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 April, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 11 October, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to EACL 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2208.06066">arXiv:2208.06066</a> <span> [<a href="https://arxiv.org/pdf/2208.06066">pdf</a>, <a href="https://arxiv.org/format/2208.06066">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Deep is a Luxury We Don't Have </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Taha%2C+A">Ahmed Taha</a>, <a href="/search/cs?searchtype=author&query=Vu%2C+Y+N+T">Yen Nhi Truong Vu</a>, <a href="/search/cs?searchtype=author&query=Mombourquette%2C+B">Brent Mombourquette</a>, <a href="/search/cs?searchtype=author&query=Matthews%2C+T+P">Thomas Paul Matthews</a>, <a href="/search/cs?searchtype=author&query=Su%2C+J">Jason Su</a>, <a href="/search/cs?searchtype=author&query=Singh%2C+S">Sadanand Singh</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2208.06066v1-abstract-short" style="display: inline;"> Medical images come in high resolutions. A high resolution is vital for finding malignant tissues at an early stage. Yet, this resolution presents a challenge in terms of modeling long range dependencies. Shallow transformers eliminate this problem, but they suffer from quadratic complexity. In this paper, we tackle this complexity by leveraging a linear self-attention approximation. Through this… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2208.06066v1-abstract-full').style.display = 'inline'; document.getElementById('2208.06066v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2208.06066v1-abstract-full" style="display: none;"> Medical images come in high resolutions. A high resolution is vital for finding malignant tissues at an early stage. Yet, this resolution presents a challenge in terms of modeling long range dependencies. Shallow transformers eliminate this problem, but they suffer from quadratic complexity. In this paper, we tackle this complexity by leveraging a linear self-attention approximation. Through this approximation, we propose an efficient vision model called HCT that stands for High resolution Convolutional Transformer. HCT brings transformers' merits to high resolution images at a significantly lower cost. We evaluate HCT using a high resolution mammography dataset. HCT is significantly superior to its CNN counterpart. Furthermore, we demonstrate HCT's fitness for medical images by evaluating its effective receptive field.Code available at https://bit.ly/3ykBhhf <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2208.06066v1-abstract-full').style.display = 'none'; document.getElementById('2208.06066v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 August, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">MICCAI 2022 + Extra Experiments</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2208.00433">arXiv:2208.00433</a> <span> [<a href="https://arxiv.org/pdf/2208.00433">pdf</a>, <a href="https://arxiv.org/format/2208.00433">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> The Who in Code-Switching: A Case Study for Predicting Egyptian Arabic-English Code-Switching Levels based on Character Profiles </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hamed%2C+I">Injy Hamed</a>, <a href="/search/cs?searchtype=author&query=Bolock%2C+A+E">Alia El Bolock</a>, <a href="/search/cs?searchtype=author&query=Herbert%2C+C">Cornelia Herbert</a>, <a href="/search/cs?searchtype=author&query=Abdennadher%2C+S">Slim Abdennadher</a>, <a href="/search/cs?searchtype=author&query=Vu%2C+N+T">Ngoc Thang Vu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2208.00433v1-abstract-short" style="display: inline;"> Code-switching (CS) is a common linguistic phenomenon exhibited by multilingual individuals, where they tend to alternate between languages within one single conversation. CS is a complex phenomenon that not only encompasses linguistic challenges, but also contains a great deal of complexity in terms of its dynamic behaviour across speakers. Given that the factors giving rise to CS vary from one c… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2208.00433v1-abstract-full').style.display = 'inline'; document.getElementById('2208.00433v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2208.00433v1-abstract-full" style="display: none;"> Code-switching (CS) is a common linguistic phenomenon exhibited by multilingual individuals, where they tend to alternate between languages within one single conversation. CS is a complex phenomenon that not only encompasses linguistic challenges, but also contains a great deal of complexity in terms of its dynamic behaviour across speakers. Given that the factors giving rise to CS vary from one country to the other, as well as from one person to the other, CS is found to be a speaker-dependant behaviour, where the frequency by which the foreign language is embedded differs across speakers. While several researchers have looked into predicting CS behaviour from a linguistic point of view, research is still lacking in the task of predicting user CS behaviour from sociological and psychological perspectives. We provide an empirical user study, where we investigate the correlations between users' CS levels and character traits. We conduct interviews with bilinguals and gather information on their profiles, including their demographics, personality traits, and traveling experiences. We then use machine learning (ML) to predict users' CS levels based on their profiles, where we identify the main influential factors in the modeling process. We experiment with both classification as well as regression tasks. Our results show that the CS behaviour is affected by the relation between speakers, travel experiences as well as Neuroticism and Extraversion personality traits. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2208.00433v1-abstract-full').style.display = 'none'; document.getElementById('2208.00433v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 July, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">To be published in the International Journal of Asian Language Processing. arXiv admin note: substantial text overlap with arXiv:2112.06462</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2207.05549">arXiv:2207.05549</a> <span> [<a href="https://arxiv.org/pdf/2207.05549">pdf</a>, <a href="https://arxiv.org/format/2207.05549">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> PoeticTTS -- Controllable Poetry Reading for Literary Studies </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Koch%2C+J">Julia Koch</a>, <a href="/search/cs?searchtype=author&query=Lux%2C+F">Florian Lux</a>, <a href="/search/cs?searchtype=author&query=Schauffler%2C+N">Nadja Schauffler</a>, <a href="/search/cs?searchtype=author&query=Bernhart%2C+T">Toni Bernhart</a>, <a href="/search/cs?searchtype=author&query=Dieterle%2C+F">Felix Dieterle</a>, <a href="/search/cs?searchtype=author&query=Kuhn%2C+J">Jonas Kuhn</a>, <a href="/search/cs?searchtype=author&query=Richter%2C+S">Sandra Richter</a>, <a href="/search/cs?searchtype=author&query=Viehhauser%2C+G">Gabriel Viehhauser</a>, <a href="/search/cs?searchtype=author&query=Vu%2C+N+T">Ngoc Thang Vu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2207.05549v2-abstract-short" style="display: inline;"> Speech synthesis for poetry is challenging due to specific intonation patterns inherent to poetic speech. In this work, we propose an approach to synthesise poems with almost human like naturalness in order to enable literary scholars to systematically examine hypotheses on the interplay between text, spoken realisation, and the listener's perception of poems. To meet these special requirements fo… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2207.05549v2-abstract-full').style.display = 'inline'; document.getElementById('2207.05549v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2207.05549v2-abstract-full" style="display: none;"> Speech synthesis for poetry is challenging due to specific intonation patterns inherent to poetic speech. In this work, we propose an approach to synthesise poems with almost human like naturalness in order to enable literary scholars to systematically examine hypotheses on the interplay between text, spoken realisation, and the listener's perception of poems. To meet these special requirements for literary studies, we resynthesise poems by cloning prosodic values from a human reference recitation, and afterwards make use of fine-grained prosody control to manipulate the synthetic speech in a human-in-the-loop setting to alter the recitation w.r.t. specific phenomena. We find that finetuning our TTS model on poetry captures poetic intonation patterns to a large extent which is beneficial for prosody cloning and manipulation and verify the success of our approach both in an objective evaluation as well as in human studies. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2207.05549v2-abstract-full').style.display = 'none'; document.getElementById('2207.05549v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 October, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 11 July, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Presented at Interspeech 2022</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2207.04834">arXiv:2207.04834</a> <span> [<a href="https://arxiv.org/pdf/2207.04834">pdf</a>, <a href="https://arxiv.org/format/2207.04834">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Speaker Anonymization with Phonetic Intermediate Representations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Meyer%2C+S">Sarina Meyer</a>, <a href="/search/cs?searchtype=author&query=Lux%2C+F">Florian Lux</a>, <a href="/search/cs?searchtype=author&query=Denisov%2C+P">Pavel Denisov</a>, <a href="/search/cs?searchtype=author&query=Koch%2C+J">Julia Koch</a>, <a href="/search/cs?searchtype=author&query=Tilli%2C+P">Pascal Tilli</a>, <a href="/search/cs?searchtype=author&query=Vu%2C+N+T">Ngoc Thang Vu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2207.04834v1-abstract-short" style="display: inline;"> In this work, we propose a speaker anonymization pipeline that leverages high quality automatic speech recognition and synthesis systems to generate speech conditioned on phonetic transcriptions and anonymized speaker embeddings. Using phones as the intermediate representation ensures near complete elimination of speaker identity information from the input while preserving the original phonetic co… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2207.04834v1-abstract-full').style.display = 'inline'; document.getElementById('2207.04834v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2207.04834v1-abstract-full" style="display: none;"> In this work, we propose a speaker anonymization pipeline that leverages high quality automatic speech recognition and synthesis systems to generate speech conditioned on phonetic transcriptions and anonymized speaker embeddings. Using phones as the intermediate representation ensures near complete elimination of speaker identity information from the input while preserving the original phonetic content as much as possible. Our experimental results on LibriSpeech and VCTK corpora reveal two key findings: 1) although automatic speech recognition produces imperfect transcriptions, our neural speech synthesis system can handle such errors, making our system feasible and robust, and 2) combining speaker embeddings from different resources is beneficial and their appropriate normalization is crucial. Overall, our final best system outperforms significantly the baselines provided in the Voice Privacy Challenge 2020 in terms of privacy robustness against a lazy-informed attacker while maintaining high intelligibility and naturalness of the anonymized speech. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2207.04834v1-abstract-full').style.display = 'none'; document.getElementById('2207.04834v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 July, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at Interspeech 2022</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2206.12229">arXiv:2206.12229</a> <span> [<a href="https://arxiv.org/pdf/2206.12229">pdf</a>, <a href="https://arxiv.org/format/2206.12229">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Exact Prosody Cloning in Zero-Shot Multispeaker Text-to-Speech </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lux%2C+F">Florian Lux</a>, <a href="/search/cs?searchtype=author&query=Koch%2C+J">Julia Koch</a>, <a href="/search/cs?searchtype=author&query=Vu%2C+N+T">Ngoc Thang Vu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2206.12229v2-abstract-short" style="display: inline;"> The cloning of a speaker's voice using an untranscribed reference sample is one of the great advances of modern neural text-to-speech (TTS) methods. Approaches for mimicking the prosody of a transcribed reference audio have also been proposed recently. In this work, we bring these two tasks together for the first time through utterance level normalization in conjunction with an utterance level spe… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2206.12229v2-abstract-full').style.display = 'inline'; document.getElementById('2206.12229v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2206.12229v2-abstract-full" style="display: none;"> The cloning of a speaker's voice using an untranscribed reference sample is one of the great advances of modern neural text-to-speech (TTS) methods. Approaches for mimicking the prosody of a transcribed reference audio have also been proposed recently. In this work, we bring these two tasks together for the first time through utterance level normalization in conjunction with an utterance level speaker embedding. We further introduce a lightweight aligner for extracting fine-grained prosodic features, that can be finetuned on individual samples within seconds. We show that it is possible to clone the voice of a speaker as well as the prosody of a spoken reference independently without any degradation in quality and high similarity to both original voice and prosody, as our objective evaluation and human study show. All of our code and trained models are available, alongside static and interactive demos. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2206.12229v2-abstract-full').style.display = 'none'; document.getElementById('2206.12229v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 October, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 24 June, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to IEEE SLT 2022</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2205.12649">arXiv:2205.12649</a> <span> [<a href="https://arxiv.org/pdf/2205.12649">pdf</a>, <a href="https://arxiv.org/format/2205.12649">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Investigating Lexical Replacements for Arabic-English Code-Switched Data Augmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hamed%2C+I">Injy Hamed</a>, <a href="/search/cs?searchtype=author&query=Habash%2C+N">Nizar Habash</a>, <a href="/search/cs?searchtype=author&query=Abdennadher%2C+S">Slim Abdennadher</a>, <a href="/search/cs?searchtype=author&query=Vu%2C+N+T">Ngoc Thang Vu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2205.12649v2-abstract-short" style="display: inline;"> Data sparsity is a main problem hindering the development of code-switching (CS) NLP systems. In this paper, we investigate data augmentation techniques for synthesizing dialectal Arabic-English CS text. We perform lexical replacements using word-aligned parallel corpora where CS points are either randomly chosen or learnt using a sequence-to-sequence model. We compare these approaches against dic… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2205.12649v2-abstract-full').style.display = 'inline'; document.getElementById('2205.12649v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2205.12649v2-abstract-full" style="display: none;"> Data sparsity is a main problem hindering the development of code-switching (CS) NLP systems. In this paper, we investigate data augmentation techniques for synthesizing dialectal Arabic-English CS text. We perform lexical replacements using word-aligned parallel corpora where CS points are either randomly chosen or learnt using a sequence-to-sequence model. We compare these approaches against dictionary-based replacements. We assess the quality of the generated sentences through human evaluation and evaluate the effectiveness of data augmentation on machine translation (MT), automatic speech recognition (ASR), and speech translation (ST) tasks. Results show that using a predictive model results in more natural CS sentences compared to the random approach, as reported in human judgements. In the downstream tasks, despite the random approach generating more data, both approaches perform equally (outperforming dictionary-based replacements). Overall, data augmentation achieves 34% improvement in perplexity, 5.2% relative improvement on WER for ASR task, +4.0-5.1 BLEU points on MT task, and +2.1-2.2 BLEU points on ST over a baseline trained on available data without augmentation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2205.12649v2-abstract-full').style.display = 'none'; document.getElementById('2205.12649v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 April, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 25 May, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to LoResMT 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2205.01500">arXiv:2205.01500</a> <span> [<a href="https://arxiv.org/pdf/2205.01500">pdf</a>, <a href="https://arxiv.org/format/2205.01500">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Meta Learning for Natural Language Processing: A Survey </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lee%2C+H">Hung-yi Lee</a>, <a href="/search/cs?searchtype=author&query=Li%2C+S">Shang-Wen Li</a>, <a href="/search/cs?searchtype=author&query=Vu%2C+N+T">Ngoc Thang Vu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2205.01500v2-abstract-short" style="display: inline;"> Deep learning has been the mainstream technique in natural language processing (NLP) area. However, the techniques require many labeled data and are less generalizable across domains. Meta-learning is an arising field in machine learning studying approaches to learn better learning algorithms. Approaches aim at improving algorithms in various aspects, including data efficiency and generalizability… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2205.01500v2-abstract-full').style.display = 'inline'; document.getElementById('2205.01500v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2205.01500v2-abstract-full" style="display: none;"> Deep learning has been the mainstream technique in natural language processing (NLP) area. However, the techniques require many labeled data and are less generalizable across domains. Meta-learning is an arising field in machine learning studying approaches to learn better learning algorithms. Approaches aim at improving algorithms in various aspects, including data efficiency and generalizability. Efficacy of approaches has been shown in many NLP tasks, but there is no systematic survey of these approaches in NLP, which hinders more researchers from joining the field. Our goal with this survey paper is to offer researchers pointers to relevant meta-learning works in NLP and attract more attention from the NLP community to drive future innovation. This paper first introduces the general concepts of meta-learning and the common approaches. Then we summarize task construction settings and application of meta-learning for various NLP problems and review the development of meta-learning in NLP community. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2205.01500v2-abstract-full').style.display = 'none'; document.getElementById('2205.01500v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 July, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 3 May, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by NAACL 2022</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2204.06671">arXiv:2204.06671</a> <span> [<a href="https://arxiv.org/pdf/2204.06671">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> A deep learning algorithm for reducing false positives in screening mammography </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Pedemonte%2C+S">Stefano Pedemonte</a>, <a href="/search/cs?searchtype=author&query=Tsue%2C+T">Trevor Tsue</a>, <a href="/search/cs?searchtype=author&query=Mombourquette%2C+B">Brent Mombourquette</a>, <a href="/search/cs?searchtype=author&query=Vu%2C+Y+N+T">Yen Nhi Truong Vu</a>, <a href="/search/cs?searchtype=author&query=Matthews%2C+T">Thomas Matthews</a>, <a href="/search/cs?searchtype=author&query=Hoil%2C+R+M">Rodrigo Morales Hoil</a>, <a href="/search/cs?searchtype=author&query=Shah%2C+M">Meet Shah</a>, <a href="/search/cs?searchtype=author&query=Ghare%2C+N">Nikita Ghare</a>, <a href="/search/cs?searchtype=author&query=Zingman-Daniels%2C+N">Naomi Zingman-Daniels</a>, <a href="/search/cs?searchtype=author&query=Holley%2C+S">Susan Holley</a>, <a href="/search/cs?searchtype=author&query=Appleton%2C+C+M">Catherine M. Appleton</a>, <a href="/search/cs?searchtype=author&query=Su%2C+J">Jason Su</a>, <a href="/search/cs?searchtype=author&query=Wahl%2C+R+L">Richard L. Wahl</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2204.06671v1-abstract-short" style="display: inline;"> Screening mammography improves breast cancer outcomes by enabling early detection and treatment. However, false positive callbacks for additional imaging from screening exams cause unnecessary procedures, patient anxiety, and financial burden. This work demonstrates an AI algorithm that reduces false positives by identifying mammograms not suspicious for breast cancer. We trained the algorithm to… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2204.06671v1-abstract-full').style.display = 'inline'; document.getElementById('2204.06671v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2204.06671v1-abstract-full" style="display: none;"> Screening mammography improves breast cancer outcomes by enabling early detection and treatment. However, false positive callbacks for additional imaging from screening exams cause unnecessary procedures, patient anxiety, and financial burden. This work demonstrates an AI algorithm that reduces false positives by identifying mammograms not suspicious for breast cancer. We trained the algorithm to determine the absence of cancer using 123,248 2D digital mammograms (6,161 cancers) and performed a retrospective study on 14,831 screening exams (1,026 cancers) from 15 US and 3 UK sites. Retrospective evaluation of the algorithm on the largest of the US sites (11,592 mammograms, 101 cancers) a) left the cancer detection rate unaffected (p=0.02, non-inferiority margin 0.25 cancers per 1000 exams), b) reduced callbacks for diagnostic exams by 31.1% compared to standard clinical readings, c) reduced benign needle biopsies by 7.4%, and d) reduced screening exams requiring radiologist interpretation by 41.6% in the simulated clinical workflow. This work lays the foundation for semi-autonomous breast cancer screening systems that could benefit patients and healthcare systems by reducing false positives, unnecessary procedures, patient anxiety, and expenses. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2204.06671v1-abstract-full').style.display = 'none'; document.getElementById('2204.06671v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 April, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2022. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2203.08954">arXiv:2203.08954</a> <span> [<a href="https://arxiv.org/pdf/2203.08954">pdf</a>, <a href="https://arxiv.org/format/2203.08954">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> BPE vs. Morphological Segmentation: A Case Study on Machine Translation of Four Polysynthetic Languages </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Mager%2C+M">Manuel Mager</a>, <a href="/search/cs?searchtype=author&query=Oncevay%2C+A">Arturo Oncevay</a>, <a href="/search/cs?searchtype=author&query=Mager%2C+E">Elisabeth Mager</a>, <a href="/search/cs?searchtype=author&query=Kann%2C+K">Katharina Kann</a>, <a href="/search/cs?searchtype=author&query=Vu%2C+N+T">Ngoc Thang Vu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2203.08954v1-abstract-short" style="display: inline;"> Morphologically-rich polysynthetic languages present a challenge for NLP systems due to data sparsity, and a common strategy to handle this issue is to apply subword segmentation. We investigate a wide variety of supervised and unsupervised morphological segmentation methods for four polysynthetic languages: Nahuatl, Raramuri, Shipibo-Konibo, and Wixarika. Then, we compare the morphologically insp… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2203.08954v1-abstract-full').style.display = 'inline'; document.getElementById('2203.08954v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2203.08954v1-abstract-full" style="display: none;"> Morphologically-rich polysynthetic languages present a challenge for NLP systems due to data sparsity, and a common strategy to handle this issue is to apply subword segmentation. We investigate a wide variety of supervised and unsupervised morphological segmentation methods for four polysynthetic languages: Nahuatl, Raramuri, Shipibo-Konibo, and Wixarika. Then, we compare the morphologically inspired segmentation methods against Byte-Pair Encodings (BPEs) as inputs for machine translation (MT) when translating to and from Spanish. We show that for all language pairs except for Nahuatl, an unsupervised morphological segmentation algorithm outperforms BPEs consistently and that, although supervised methods achieve better segmentation scores, they under-perform in MT challenges. Finally, we contribute two new morphological segmentation datasets for Raramuri and Shipibo-Konibo, and a parallel corpus for Raramuri--Spanish. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2203.08954v1-abstract-full').style.display = 'none'; document.getElementById('2203.08954v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 March, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to Findings of ACL 2022</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2203.03191">arXiv:2203.03191</a> <span> [<a href="https://arxiv.org/pdf/2203.03191">pdf</a>, <a href="https://arxiv.org/format/2203.03191">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Language-Agnostic Meta-Learning for Low-Resource Text-to-Speech with Articulatory Features </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lux%2C+F">Florian Lux</a>, <a href="/search/cs?searchtype=author&query=Vu%2C+N+T">Ngoc Thang Vu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2203.03191v1-abstract-short" style="display: inline;"> While neural text-to-speech systems perform remarkably well in high-resource scenarios, they cannot be applied to the majority of the over 6,000 spoken languages in the world due to a lack of appropriate training data. In this work, we use embeddings derived from articulatory vectors rather than embeddings derived from phoneme identities to learn phoneme representations that hold across languages.… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2203.03191v1-abstract-full').style.display = 'inline'; document.getElementById('2203.03191v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2203.03191v1-abstract-full" style="display: none;"> While neural text-to-speech systems perform remarkably well in high-resource scenarios, they cannot be applied to the majority of the over 6,000 spoken languages in the world due to a lack of appropriate training data. In this work, we use embeddings derived from articulatory vectors rather than embeddings derived from phoneme identities to learn phoneme representations that hold across languages. In conjunction with language agnostic meta learning, this enables us to fine-tune a high-quality text-to-speech model on just 30 minutes of data in a previously unseen language spoken by a previously unseen speaker. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2203.03191v1-abstract-full').style.display = 'none'; document.getElementById('2203.03191v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 March, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted for the ACL 2022 main conference</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2201.11569">arXiv:2201.11569</a> <span> [<a href="https://arxiv.org/pdf/2201.11569">pdf</a>, <a href="https://arxiv.org/format/2201.11569">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3531146.3533127">10.1145/3531146.3533127 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Human Interpretation of Saliency-based Explanation Over Text </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Schuff%2C+H">Hendrik Schuff</a>, <a href="/search/cs?searchtype=author&query=Jacovi%2C+A">Alon Jacovi</a>, <a href="/search/cs?searchtype=author&query=Adel%2C+H">Heike Adel</a>, <a href="/search/cs?searchtype=author&query=Goldberg%2C+Y">Yoav Goldberg</a>, <a href="/search/cs?searchtype=author&query=Vu%2C+N+T">Ngoc Thang Vu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2201.11569v2-abstract-short" style="display: inline;"> While a lot of research in explainable AI focuses on producing effective explanations, less work is devoted to the question of how people understand and interpret the explanation. In this work, we focus on this question through a study of saliency-based explanations over textual data. Feature-attribution explanations of text models aim to communicate which parts of the input text were more influen… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2201.11569v2-abstract-full').style.display = 'inline'; document.getElementById('2201.11569v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2201.11569v2-abstract-full" style="display: none;"> While a lot of research in explainable AI focuses on producing effective explanations, less work is devoted to the question of how people understand and interpret the explanation. In this work, we focus on this question through a study of saliency-based explanations over textual data. Feature-attribution explanations of text models aim to communicate which parts of the input text were more influential than others towards the model decision. Many current explanation methods, such as gradient-based or Shapley value-based methods, provide measures of importance which are well-understood mathematically. But how does a person receiving the explanation (the explainee) comprehend it? And does their understanding match what the explanation attempted to communicate? We empirically investigate the effect of various factors of the input, the feature-attribution explanation, and visualization procedure, on laypeople's interpretation of the explanation. We query crowdworkers for their interpretation on tasks in English and German, and fit a GAMM model to their responses considering the factors of interest. We find that people often mis-interpret the explanations: superficial and unrelated factors, such as word length, influence the explainees' importance assignment despite the explanation communicating importance directly. We then show that some of this distortion can be attenuated: we propose a method to adjust saliencies based on model estimates of over- and under-perception, and explore bar charts as an alternative to heatmap saliency visualization. We find that both approaches can attenuate the distorting effect of specific factors, leading to better-calibrated understanding of the explanation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2201.11569v2-abstract-full').style.display = 'none'; document.getElementById('2201.11569v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 June, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 27 January, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">FAccT 2022</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2112.10202">arXiv:2112.10202</a> <span> [<a href="https://arxiv.org/pdf/2112.10202">pdf</a>, <a href="https://arxiv.org/ps/2112.10202">ps</a>, <a href="https://arxiv.org/format/2112.10202">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Integrating Knowledge in End-to-End Automatic Speech Recognition for Mandarin-English Code-Switching </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+C">Chia-Yu Li</a>, <a href="/search/cs?searchtype=author&query=Vu%2C+N+T">Ngoc Thang Vu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2112.10202v1-abstract-short" style="display: inline;"> Code-Switching (CS) is a common linguistic phenomenon in multilingual communities that consists of switching between languages while speaking. This paper presents our investigations on end-to-end speech recognition for Mandarin-English CS speech. We analyse different CS specific issues such as the properties mismatches between languages in a CS language pair, the unpredictable nature of switching… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2112.10202v1-abstract-full').style.display = 'inline'; document.getElementById('2112.10202v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2112.10202v1-abstract-full" style="display: none;"> Code-Switching (CS) is a common linguistic phenomenon in multilingual communities that consists of switching between languages while speaking. This paper presents our investigations on end-to-end speech recognition for Mandarin-English CS speech. We analyse different CS specific issues such as the properties mismatches between languages in a CS language pair, the unpredictable nature of switching points, and the data scarcity problem. We exploit and improve the state-of-the-art end-to-end system by merging nonlinguistic symbols, by integrating language identification using hierarchical softmax, by modeling sub-word units, by artificially lowering the speaking rate, and by augmenting data using speed perturbed technique and several monolingual datasets to improve the final performance not only on CS speech but also on monolingual benchmarks in order to make the system more applicable on real life settings. Finally, we explore the effect of different language model integration methods on the performance of the proposed model. Our experimental results reveal that all the proposed techniques improve the recognition performance. The best combined system improves the baseline system by up to 35% relatively in terms of mixed error rate and delivers acceptable performance on monolingual benchmarks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2112.10202v1-abstract-full').style.display = 'none'; document.getElementById('2112.10202v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 December, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">The 2019 International Conference on Asian Language Processing (IALP)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2112.10108">arXiv:2112.10108</a> <span> [<a href="https://arxiv.org/pdf/2112.10108">pdf</a>, <a href="https://arxiv.org/format/2112.10108">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Investigation of Densely Connected Convolutional Networks with Domain Adversarial Learning for Noise Robust Speech Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+C+Y">Chia Yu Li</a>, <a href="/search/cs?searchtype=author&query=Vu%2C+N+T">Ngoc Thang Vu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2112.10108v1-abstract-short" style="display: inline;"> We investigate densely connected convolutional networks (DenseNets) and their extension with domain adversarial training for noise robust speech recognition. DenseNets are very deep, compact convolutional neural networks which have demonstrated incredible improvements over the state-of-the-art results in computer vision. Our experimental results reveal that DenseNets are more robust against noise… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2112.10108v1-abstract-full').style.display = 'inline'; document.getElementById('2112.10108v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2112.10108v1-abstract-full" style="display: none;"> We investigate densely connected convolutional networks (DenseNets) and their extension with domain adversarial training for noise robust speech recognition. DenseNets are very deep, compact convolutional neural networks which have demonstrated incredible improvements over the state-of-the-art results in computer vision. Our experimental results reveal that DenseNets are more robust against noise than other neural network based models such as deep feed forward neural networks and convolutional neural networks. Moreover, domain adversarial learning can further improve the robustness of DenseNets against both, known and unknown noise conditions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2112.10108v1-abstract-full').style.display = 'none'; document.getElementById('2112.10108v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 December, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">7 pages, 5 figures, The 30th Conference on Electronic Speech Signal Processing (ESSV2019)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2112.06462">arXiv:2112.06462</a> <span> [<a href="https://arxiv.org/pdf/2112.06462">pdf</a>, <a href="https://arxiv.org/format/2112.06462">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Predicting User Code-Switching Level from Sociological and Psychological Profiles </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hamed%2C+I">Injy Hamed</a>, <a href="/search/cs?searchtype=author&query=Bolock%2C+A+E">Alia El Bolock</a>, <a href="/search/cs?searchtype=author&query=Rizk%2C+N">Nader Rizk</a>, <a href="/search/cs?searchtype=author&query=Herbert%2C+C">Cornelia Herbert</a>, <a href="/search/cs?searchtype=author&query=Abdennadher%2C+S">Slim Abdennadher</a>, <a href="/search/cs?searchtype=author&query=Vu%2C+N+T">Ngoc Thang Vu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2112.06462v1-abstract-short" style="display: inline;"> Multilingual speakers tend to alternate between languages within a conversation, a phenomenon referred to as "code-switching" (CS). CS is a complex phenomenon that not only encompasses linguistic challenges, but also contains a great deal of complexity in terms of its dynamic behaviour across speakers. This dynamic behaviour has been studied by sociologists and psychologists, identifying factors a… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2112.06462v1-abstract-full').style.display = 'inline'; document.getElementById('2112.06462v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2112.06462v1-abstract-full" style="display: none;"> Multilingual speakers tend to alternate between languages within a conversation, a phenomenon referred to as "code-switching" (CS). CS is a complex phenomenon that not only encompasses linguistic challenges, but also contains a great deal of complexity in terms of its dynamic behaviour across speakers. This dynamic behaviour has been studied by sociologists and psychologists, identifying factors affecting CS. In this paper, we provide an empirical user study on Arabic-English CS, where we show the correlation between users' CS frequency and character traits. We use machine learning (ML) to validate the findings, informing and confirming existing theories. The predictive models were able to predict users' CS frequency with an accuracy higher than 55%, where travel experiences and personality traits played the biggest role in the modeling process. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2112.06462v1-abstract-full').style.display = 'none'; document.getElementById('2112.06462v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 December, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">To be published in the proceedings of the International Conference on Asian Language Information Processing</span> </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Vu%2C+N+T&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Vu%2C+N+T&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Vu%2C+N+T&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> </ul> </nav> <div class="is-hidden-tablet">  <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>  </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary">  <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div>   <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div>  </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>

CINXE.COM

Search | arXiv e-print repository