CINXE.COM
Search | arXiv e-print repository
<!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"/> <meta name="viewport" content="width=device-width, initial-scale=1"/> <!-- new favicon config and versions by realfavicongenerator.net --> <link rel="apple-touch-icon" sizes="180x180" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-16x16.png"> <link rel="manifest" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/site.webmanifest"> <link rel="mask-icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/safari-pinned-tab.svg" color="#b31b1b"> <link rel="shortcut icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon.ico"> <meta name="msapplication-TileColor" content="#b31b1b"> <meta name="msapplication-config" content="images/icons/browserconfig.xml"> <meta name="theme-color" content="#b31b1b"> <!-- end favicon config --> <title>Search | arXiv e-print repository</title> <script defer src="https://static.arxiv.org/static/base/1.0.0a5/fontawesome-free-5.11.2-web/js/all.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/base/1.0.0a5/css/arxivstyle.css" /> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ messageStyle: "none", extensions: ["tex2jax.js"], jax: ["input/TeX", "output/HTML-CSS"], tex2jax: { inlineMath: [ ['$','$'], ["\\(","\\)"] ], displayMath: [ ['$$','$$'], ["\\[","\\]"] ], processEscapes: true, ignoreClass: '.*', processClass: 'mathjax.*' }, TeX: { extensions: ["AMSmath.js", "AMSsymbols.js", "noErrors.js"], noErrors: { inlineDelimiters: ["$","$"], multiLine: false, style: { "font-size": "normal", "border": "" } } }, "HTML-CSS": { availableFonts: ["TeX"] } }); </script> <script src='//static.arxiv.org/MathJax-2.7.3/MathJax.js'></script> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/notification.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/bulma-tooltip.min.css" /> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/search.css" /> <script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g=" crossorigin="anonymous"></script> <script src="https://static.arxiv.org/static/search/0.5.6/js/fieldset.js"></script> <style> radio#cf-customfield_11400 { display: none; } </style> </head> <body> <header><a href="#main-container" class="is-sr-only">Skip to main content</a> <!-- contains Cornell logo and sponsor statement --> <div class="attribution level is-marginless" role="banner"> <div class="level-left"> <a class="level-item" href="https://cornell.edu/"><img src="https://static.arxiv.org/static/base/1.0.0a5/images/cornell-reduced-white-SMALL.svg" alt="Cornell University" width="200" aria-label="logo" /></a> </div> <div class="level-right is-marginless"><p class="sponsors level-item is-marginless"><span id="support-ack-url">We gratefully acknowledge support from<br /> the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors. <a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–15 of 15 results for author: <span class="mathjax">Casanova, E</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Casanova%2C+E">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." type="text" value="Casanova, E"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Casanova%2C+E&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Casanova, E"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. </div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.12117">arXiv:2409.12117</a> <span> [<a href="https://arxiv.org/pdf/2409.12117">pdf</a>, <a href="https://arxiv.org/ps/2409.12117">ps</a>, <a href="https://arxiv.org/format/2409.12117">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Low Frame-rate Speech Codec: a Codec Designed for Fast High-quality Speech LLM Training and Inference </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Casanova%2C+E">Edresson Casanova</a>, <a href="/search/cs?searchtype=author&query=Langman%2C+R">Ryan Langman</a>, <a href="/search/cs?searchtype=author&query=Neekhara%2C+P">Paarth Neekhara</a>, <a href="/search/cs?searchtype=author&query=Hussain%2C+S">Shehzeen Hussain</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jason Li</a>, <a href="/search/cs?searchtype=author&query=Ghosh%2C+S">Subhankar Ghosh</a>, <a href="/search/cs?searchtype=author&query=Juki%C4%87%2C+A">Ante Juki膰</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+S">Sang-gil Lee</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.12117v1-abstract-short" style="display: inline;"> Large language models (LLMs) have significantly advanced audio processing through audio codecs that convert audio into discrete tokens, enabling the application of language modeling techniques to audio data. However, audio codecs often operate at high frame rates, resulting in slow training and inference, especially for autoregressive models. To address this challenge, we present the Low Frame-rat… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.12117v1-abstract-full').style.display = 'inline'; document.getElementById('2409.12117v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.12117v1-abstract-full" style="display: none;"> Large language models (LLMs) have significantly advanced audio processing through audio codecs that convert audio into discrete tokens, enabling the application of language modeling techniques to audio data. However, audio codecs often operate at high frame rates, resulting in slow training and inference, especially for autoregressive models. To address this challenge, we present the Low Frame-rate Speech Codec (LFSC): a neural audio codec that leverages finite scalar quantization and adversarial training with large speech language models to achieve high-quality audio compression with a 1.89 kbps bitrate and 21.5 frames per second. We demonstrate that our novel codec can make the inference of LLM-based text-to-speech models around three times faster while improving intelligibility and producing quality comparable to previous models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.12117v1-abstract-full').style.display = 'none'; document.getElementById('2409.12117v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Submitted to ICASSP 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.04904">arXiv:2406.04904</a> <span> [<a href="https://arxiv.org/pdf/2406.04904">pdf</a>, <a href="https://arxiv.org/format/2406.04904">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> XTTS: a Massively Multilingual Zero-Shot Text-to-Speech Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Casanova%2C+E">Edresson Casanova</a>, <a href="/search/cs?searchtype=author&query=Davis%2C+K">Kelly Davis</a>, <a href="/search/cs?searchtype=author&query=G%C3%B6lge%2C+E">Eren G枚lge</a>, <a href="/search/cs?searchtype=author&query=G%C3%B6knar%2C+G">G枚rkem G枚knar</a>, <a href="/search/cs?searchtype=author&query=Gulea%2C+I">Iulian Gulea</a>, <a href="/search/cs?searchtype=author&query=Hart%2C+L">Logan Hart</a>, <a href="/search/cs?searchtype=author&query=Aljafari%2C+A">Aya Aljafari</a>, <a href="/search/cs?searchtype=author&query=Meyer%2C+J">Joshua Meyer</a>, <a href="/search/cs?searchtype=author&query=Morais%2C+R">Reuben Morais</a>, <a href="/search/cs?searchtype=author&query=Olayemi%2C+S">Samuel Olayemi</a>, <a href="/search/cs?searchtype=author&query=Weber%2C+J">Julian Weber</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.04904v1-abstract-short" style="display: inline;"> Most Zero-shot Multi-speaker TTS (ZS-TTS) systems support only a single language. Although models like YourTTS, VALL-E X, Mega-TTS 2, and Voicebox explored Multilingual ZS-TTS they are limited to just a few high/medium resource languages, limiting the applications of these models in most of the low/medium resource languages. In this paper, we aim to alleviate this issue by proposing and making pub… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.04904v1-abstract-full').style.display = 'inline'; document.getElementById('2406.04904v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.04904v1-abstract-full" style="display: none;"> Most Zero-shot Multi-speaker TTS (ZS-TTS) systems support only a single language. Although models like YourTTS, VALL-E X, Mega-TTS 2, and Voicebox explored Multilingual ZS-TTS they are limited to just a few high/medium resource languages, limiting the applications of these models in most of the low/medium resource languages. In this paper, we aim to alleviate this issue by proposing and making publicly available the XTTS system. Our method builds upon the Tortoise model and adds several novel modifications to enable multilingual training, improve voice cloning, and enable faster training and inference. XTTS was trained in 16 languages and achieved state-of-the-art (SOTA) results in most of them. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.04904v1-abstract-full').style.display = 'none'; document.getElementById('2406.04904v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at INTERSPEECH 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2401.09512">arXiv:2401.09512</a> <span> [<a href="https://arxiv.org/pdf/2401.09512">pdf</a>, <a href="https://arxiv.org/format/2401.09512">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> MLAAD: The Multi-Language Audio Anti-Spoofing Dataset </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=M%C3%BCller%2C+N+M">Nicolas M. M眉ller</a>, <a href="/search/cs?searchtype=author&query=Kawa%2C+P">Piotr Kawa</a>, <a href="/search/cs?searchtype=author&query=Choong%2C+W+H">Wei Herng Choong</a>, <a href="/search/cs?searchtype=author&query=Casanova%2C+E">Edresson Casanova</a>, <a href="/search/cs?searchtype=author&query=G%C3%B6lge%2C+E">Eren G枚lge</a>, <a href="/search/cs?searchtype=author&query=M%C3%BCller%2C+T">Thorsten M眉ller</a>, <a href="/search/cs?searchtype=author&query=Syga%2C+P">Piotr Syga</a>, <a href="/search/cs?searchtype=author&query=Sperl%2C+P">Philip Sperl</a>, <a href="/search/cs?searchtype=author&query=B%C3%B6ttinger%2C+K">Konstantin B枚ttinger</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2401.09512v5-abstract-short" style="display: inline;"> Text-to-Speech (TTS) technology offers notable benefits, such as providing a voice for individuals with speech impairments, but it also facilitates the creation of audio deepfakes and spoofing attacks. AI-based detection methods can help mitigate these risks; however, the performance of such models is inherently dependent on the quality and diversity of their training data. Presently, the availabl… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.09512v5-abstract-full').style.display = 'inline'; document.getElementById('2401.09512v5-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2401.09512v5-abstract-full" style="display: none;"> Text-to-Speech (TTS) technology offers notable benefits, such as providing a voice for individuals with speech impairments, but it also facilitates the creation of audio deepfakes and spoofing attacks. AI-based detection methods can help mitigate these risks; however, the performance of such models is inherently dependent on the quality and diversity of their training data. Presently, the available datasets are heavily skewed towards English and Chinese audio, which limits the global applicability of these anti-spoofing systems. To address this limitation, this paper presents the Multi-Language Audio Anti-Spoof Dataset (MLAAD), created using 82 TTS models, comprising 33 different architectures, to generate 378.0 hours of synthetic voice in 38 different languages. We train and evaluate three state-of-the-art deepfake detection models with MLAAD and observe that it demonstrates superior performance over comparable datasets like InTheWild and Fake- OrReal when used as a training resource. Moreover, compared to the renowned ASVspoof 2019 dataset, MLAAD proves to be a complementary resource. In tests across eight datasets, MLAAD and ASVspoof 2019 alternately outperformed each other, each excelling on four datasets. By publishing MLAAD and making a trained model accessible via an interactive webserver, we aim to democratize anti-spoofing technology, making it accessible beyond the realm of specialists, and contributing to global efforts against audio spoofing and deepfakes. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.09512v5-abstract-full').style.display = 'none'; document.getElementById('2401.09512v5-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 17 January, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">IJCNN 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2306.10097">arXiv:2306.10097</a> <span> [<a href="https://arxiv.org/pdf/2306.10097">pdf</a>, <a href="https://arxiv.org/format/2306.10097">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> CML-TTS A Multilingual Dataset for Speech Synthesis in Low-Resource Languages </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Oliveira%2C+F+S">Frederico S. Oliveira</a>, <a href="/search/cs?searchtype=author&query=Casanova%2C+E">Edresson Casanova</a>, <a href="/search/cs?searchtype=author&query=J%C3%BAnior%2C+A+C">Arnaldo C芒ndido J煤nior</a>, <a href="/search/cs?searchtype=author&query=Soares%2C+A+S">Anderson S. Soares</a>, <a href="/search/cs?searchtype=author&query=Filho%2C+A+R+G">Arlindo R. Galv茫o Filho</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2306.10097v1-abstract-short" style="display: inline;"> In this paper, we present CML-TTS, a recursive acronym for CML-Multi-Lingual-TTS, a new Text-to-Speech (TTS) dataset developed at the Center of Excellence in Artificial Intelligence (CEIA) of the Federal University of Goias (UFG). CML-TTS is based on Multilingual LibriSpeech (MLS) and adapted for training TTS models, consisting of audiobooks in seven languages: Dutch, French, German, Italian, Port… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.10097v1-abstract-full').style.display = 'inline'; document.getElementById('2306.10097v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2306.10097v1-abstract-full" style="display: none;"> In this paper, we present CML-TTS, a recursive acronym for CML-Multi-Lingual-TTS, a new Text-to-Speech (TTS) dataset developed at the Center of Excellence in Artificial Intelligence (CEIA) of the Federal University of Goias (UFG). CML-TTS is based on Multilingual LibriSpeech (MLS) and adapted for training TTS models, consisting of audiobooks in seven languages: Dutch, French, German, Italian, Portuguese, Polish, and Spanish. Additionally, we provide the YourTTS model, a multi-lingual TTS model, trained using 3,176.13 hours from CML-TTS and also with 245.07 hours from LibriTTS, in English. Our purpose in creating this dataset is to open up new research possibilities in the TTS area for multi-lingual models. The dataset is publicly available under the CC-BY 4.0 license1. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.10097v1-abstract-full').style.display = 'none'; document.getElementById('2306.10097v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">12 pages, 5 figures, Accepted at the 25th International Conference on Text, Speech and Dialogue (TSD 2022)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2306.09979">arXiv:2306.09979</a> <span> [<a href="https://arxiv.org/pdf/2306.09979">pdf</a>, <a href="https://arxiv.org/format/2306.09979">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Evaluation of Speech Representations for MOS prediction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Oliveira%2C+F+S">Frederico S. Oliveira</a>, <a href="/search/cs?searchtype=author&query=Casanova%2C+E">Edresson Casanova</a>, <a href="/search/cs?searchtype=author&query=J%C3%BAnior%2C+A+C">Arnaldo C芒ndido J煤nior</a>, <a href="/search/cs?searchtype=author&query=Gris%2C+L+R+S">Lucas R. S. Gris</a>, <a href="/search/cs?searchtype=author&query=Soares%2C+A+S">Anderson S. Soares</a>, <a href="/search/cs?searchtype=author&query=Filho%2C+A+R+G">Arlindo R. Galv茫o Filho</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2306.09979v1-abstract-short" style="display: inline;"> In this paper, we evaluate feature extraction models for predicting speech quality. We also propose a model architecture to compare embeddings of supervised learning and self-supervised learning models with embeddings of speaker verification models to predict the metric MOS. Our experiments were performed on the VCC2018 dataset and a Brazilian-Portuguese dataset called BRSpeechMOS, which was creat… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.09979v1-abstract-full').style.display = 'inline'; document.getElementById('2306.09979v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2306.09979v1-abstract-full" style="display: none;"> In this paper, we evaluate feature extraction models for predicting speech quality. We also propose a model architecture to compare embeddings of supervised learning and self-supervised learning models with embeddings of speaker verification models to predict the metric MOS. Our experiments were performed on the VCC2018 dataset and a Brazilian-Portuguese dataset called BRSpeechMOS, which was created for this work. The results show that the Whisper model is appropriate in all scenarios: with both the VCC2018 and BRSpeech- MOS datasets. Among the supervised and self-supervised learning models using BRSpeechMOS, Whisper-Small achieved the best linear correlation of 0.6980, and the speaker verification model, SpeakerNet, had linear correlation of 0.6963. Using VCC2018, the best supervised and self-supervised learning model, Whisper-Large, achieved linear correlation of 0.7274, and the best model speaker verification, TitaNet, achieved a linear correlation of 0.6933. Although the results of the speaker verification models are slightly lower, the SpeakerNet model has only 5M parameters, making it suitable for real-time applications, and the TitaNet model produces an embedding of size 192, the smallest among all the evaluated models. The experiment results are reproducible with publicly available source-code1 . <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.09979v1-abstract-full').style.display = 'none'; document.getElementById('2306.09979v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">12 pages, 4 figures, Accepted to the 26th International Conference of Text, Speech and Dialogue (TSD2023)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2305.14580">arXiv:2305.14580</a> <span> [<a href="https://arxiv.org/pdf/2305.14580">pdf</a>, <a href="https://arxiv.org/format/2305.14580">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Evaluating OpenAI's Whisper ASR for Punctuation Prediction and Topic Modeling of life histories of the Museum of the Person </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gris%2C+L+R+S">Lucas Rafael Stefanel Gris</a>, <a href="/search/cs?searchtype=author&query=Marcacini%2C+R">Ricardo Marcacini</a>, <a href="/search/cs?searchtype=author&query=Junior%2C+A+C">Arnaldo Candido Junior</a>, <a href="/search/cs?searchtype=author&query=Casanova%2C+E">Edresson Casanova</a>, <a href="/search/cs?searchtype=author&query=Soares%2C+A">Anderson Soares</a>, <a href="/search/cs?searchtype=author&query=Alu%C3%ADsio%2C+S+M">Sandra Maria Alu铆sio</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2305.14580v2-abstract-short" style="display: inline;"> Automatic speech recognition (ASR) systems play a key role in applications involving human-machine interactions. Despite their importance, ASR models for the Portuguese language proposed in the last decade have limitations in relation to the correct identification of punctuation marks in automatic transcriptions, which hinder the use of transcriptions by other systems, models, and even by humans.… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.14580v2-abstract-full').style.display = 'inline'; document.getElementById('2305.14580v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2305.14580v2-abstract-full" style="display: none;"> Automatic speech recognition (ASR) systems play a key role in applications involving human-machine interactions. Despite their importance, ASR models for the Portuguese language proposed in the last decade have limitations in relation to the correct identification of punctuation marks in automatic transcriptions, which hinder the use of transcriptions by other systems, models, and even by humans. However, recently Whisper ASR was proposed by OpenAI, a general-purpose speech recognition model that has generated great expectations in dealing with such limitations. This chapter presents the first study on the performance of Whisper for punctuation prediction in the Portuguese language. We present an experimental evaluation considering both theoretical aspects involving pausing points (comma) and complete ideas (exclamation, question, and fullstop), as well as practical aspects involving transcript-based topic modeling - an application dependent on punctuation marks for promising performance. We analyzed experimental results from videos of Museum of the Person, a virtual museum that aims to tell and preserve people's life histories, thus discussing the pros and cons of Whisper in a real-world scenario. Although our experiments indicate that Whisper achieves state-of-the-art results, we conclude that some punctuation marks require improvements, such as exclamation, semicolon and colon. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.14580v2-abstract-full').style.display = 'none'; document.getElementById('2305.14580v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 23 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2211.14372">arXiv:2211.14372</a> <span> [<a href="https://arxiv.org/pdf/2211.14372">pdf</a>, <a href="https://arxiv.org/format/2211.14372">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Interpretability Analysis of Deep Models for COVID-19 Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=da+Silva%2C+D+P+P">Daniel Peixoto Pinto da Silva</a>, <a href="/search/cs?searchtype=author&query=Casanova%2C+E">Edresson Casanova</a>, <a href="/search/cs?searchtype=author&query=Gris%2C+L+R+S">Lucas Rafael Stefanel Gris</a>, <a href="/search/cs?searchtype=author&query=Junior%2C+A+C">Arnaldo Candido Junior</a>, <a href="/search/cs?searchtype=author&query=Finger%2C+M">Marcelo Finger</a>, <a href="/search/cs?searchtype=author&query=Svartman%2C+F">Flaviane Svartman</a>, <a href="/search/cs?searchtype=author&query=Raposo%2C+B">Beatriz Raposo</a>, <a href="/search/cs?searchtype=author&query=Martins%2C+M+V+M">Marcus Vin铆cius Moreira Martins</a>, <a href="/search/cs?searchtype=author&query=Alu%C3%ADsio%2C+S+M">Sandra Maria Alu铆sio</a>, <a href="/search/cs?searchtype=author&query=Berti%2C+L+C">Larissa Cristina Berti</a>, <a href="/search/cs?searchtype=author&query=Teixeira%2C+J+P">Jo茫o Paulo Teixeira</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2211.14372v1-abstract-short" style="display: inline;"> During the outbreak of COVID-19 pandemic, several research areas joined efforts to mitigate the damages caused by SARS-CoV-2. In this paper we present an interpretability analysis of a convolutional neural network based model for COVID-19 detection in audios. We investigate which features are important for model decision process, investigating spectrograms, F0, F0 standard deviation, sex and age.… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.14372v1-abstract-full').style.display = 'inline'; document.getElementById('2211.14372v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2211.14372v1-abstract-full" style="display: none;"> During the outbreak of COVID-19 pandemic, several research areas joined efforts to mitigate the damages caused by SARS-CoV-2. In this paper we present an interpretability analysis of a convolutional neural network based model for COVID-19 detection in audios. We investigate which features are important for model decision process, investigating spectrograms, F0, F0 standard deviation, sex and age. Following, we analyse model decisions by generating heat maps for the trained models to capture their attention during the decision process. Focusing on a explainable Inteligence Artificial approach, we show that studied models can taken unbiased decisions even in the presence of spurious data in the training set, given the adequate preprocessing steps. Our best model has 94.44% of accuracy in detection, with results indicating that models favors spectrograms for the decision process, particularly, high energy areas in the spectrogram related to prosodic domains, while F0 also leads to efficient COVID-19 detection. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.14372v1-abstract-full').style.display = 'none'; document.getElementById('2211.14372v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 November, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">14 pages, 4 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2207.03546">arXiv:2207.03546</a> <span> [<a href="https://arxiv.org/pdf/2207.03546">pdf</a>, <a href="https://arxiv.org/format/2207.03546">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> BibleTTS: a large, high-fidelity, multilingual, and uniquely African speech corpus </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Meyer%2C+J">Josh Meyer</a>, <a href="/search/cs?searchtype=author&query=Adelani%2C+D+I">David Ifeoluwa Adelani</a>, <a href="/search/cs?searchtype=author&query=Casanova%2C+E">Edresson Casanova</a>, <a href="/search/cs?searchtype=author&query=%C3%96ktem%2C+A">Alp 脰ktem</a>, <a href="/search/cs?searchtype=author&query=Weber%2C+D+W+J">Daniel Whitenack Julian Weber</a>, <a href="/search/cs?searchtype=author&query=Kabongo%2C+S">Salomon Kabongo</a>, <a href="/search/cs?searchtype=author&query=Salesky%2C+E">Elizabeth Salesky</a>, <a href="/search/cs?searchtype=author&query=Orife%2C+I">Iroro Orife</a>, <a href="/search/cs?searchtype=author&query=Leong%2C+C">Colin Leong</a>, <a href="/search/cs?searchtype=author&query=Ogayo%2C+P">Perez Ogayo</a>, <a href="/search/cs?searchtype=author&query=Emezue%2C+C">Chris Emezue</a>, <a href="/search/cs?searchtype=author&query=Mukiibi%2C+J">Jonathan Mukiibi</a>, <a href="/search/cs?searchtype=author&query=Osei%2C+S">Salomey Osei</a>, <a href="/search/cs?searchtype=author&query=Agbolo%2C+A">Apelete Agbolo</a>, <a href="/search/cs?searchtype=author&query=Akinode%2C+V">Victor Akinode</a>, <a href="/search/cs?searchtype=author&query=Opoku%2C+B">Bernard Opoku</a>, <a href="/search/cs?searchtype=author&query=Olanrewaju%2C+S">Samuel Olanrewaju</a>, <a href="/search/cs?searchtype=author&query=Alabi%2C+J">Jesujoba Alabi</a>, <a href="/search/cs?searchtype=author&query=Muhammad%2C+S">Shamsuddeen Muhammad</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2207.03546v1-abstract-short" style="display: inline;"> BibleTTS is a large, high-quality, open speech dataset for ten languages spoken in Sub-Saharan Africa. The corpus contains up to 86 hours of aligned, studio quality 48kHz single speaker recordings per language, enabling the development of high-quality text-to-speech models. The ten languages represented are: Akuapem Twi, Asante Twi, Chichewa, Ewe, Hausa, Kikuyu, Lingala, Luganda, Luo, and Yoruba.… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2207.03546v1-abstract-full').style.display = 'inline'; document.getElementById('2207.03546v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2207.03546v1-abstract-full" style="display: none;"> BibleTTS is a large, high-quality, open speech dataset for ten languages spoken in Sub-Saharan Africa. The corpus contains up to 86 hours of aligned, studio quality 48kHz single speaker recordings per language, enabling the development of high-quality text-to-speech models. The ten languages represented are: Akuapem Twi, Asante Twi, Chichewa, Ewe, Hausa, Kikuyu, Lingala, Luganda, Luo, and Yoruba. This corpus is a derivative work of Bible recordings made and released by the Open.Bible project from Biblica. We have aligned, cleaned, and filtered the original recordings, and additionally hand-checked a subset of the alignments for each language. We present results for text-to-speech models with Coqui TTS. The data is released under a commercial-friendly CC-BY-SA license. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2207.03546v1-abstract-full').style.display = 'none'; document.getElementById('2207.03546v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 July, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to INTERSPEECH 2022</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2204.00618">arXiv:2204.00618</a> <span> [<a href="https://arxiv.org/pdf/2204.00618">pdf</a>, <a href="https://arxiv.org/format/2204.00618">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> ASR data augmentation in low-resource settings using cross-lingual multi-speaker TTS and cross-lingual voice conversion </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Casanova%2C+E">Edresson Casanova</a>, <a href="/search/cs?searchtype=author&query=Shulby%2C+C">Christopher Shulby</a>, <a href="/search/cs?searchtype=author&query=Korolev%2C+A">Alexander Korolev</a>, <a href="/search/cs?searchtype=author&query=Junior%2C+A+C">Arnaldo Candido Junior</a>, <a href="/search/cs?searchtype=author&query=Soares%2C+A+d+S">Anderson da Silva Soares</a>, <a href="/search/cs?searchtype=author&query=Alu%C3%ADsio%2C+S">Sandra Alu铆sio</a>, <a href="/search/cs?searchtype=author&query=Ponti%2C+M+A">Moacir Antonelli Ponti</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2204.00618v5-abstract-short" style="display: inline;"> We explore cross-lingual multi-speaker speech synthesis and cross-lingual voice conversion applied to data augmentation for automatic speech recognition (ASR) systems in low/medium-resource scenarios. Through extensive experiments, we show that our approach permits the application of speech synthesis and voice conversion to improve ASR systems using only one target-language speaker during model tr… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2204.00618v5-abstract-full').style.display = 'inline'; document.getElementById('2204.00618v5-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2204.00618v5-abstract-full" style="display: none;"> We explore cross-lingual multi-speaker speech synthesis and cross-lingual voice conversion applied to data augmentation for automatic speech recognition (ASR) systems in low/medium-resource scenarios. Through extensive experiments, we show that our approach permits the application of speech synthesis and voice conversion to improve ASR systems using only one target-language speaker during model training. We also managed to close the gap between ASR models trained with synthesized versus human speech compared to other works that use many speakers. Finally, we show that it is possible to obtain promising ASR training results with our data augmentation method using only a single real speaker in a target language. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2204.00618v5-abstract-full').style.display = 'none'; document.getElementById('2204.00618v5-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 29 March, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This paper was accepted at INTERSPEECH 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2112.02418">arXiv:2112.02418</a> <span> [<a href="https://arxiv.org/pdf/2112.02418">pdf</a>, <a href="https://arxiv.org/format/2112.02418">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> YourTTS: Towards Zero-Shot Multi-Speaker TTS and Zero-Shot Voice Conversion for everyone </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Casanova%2C+E">Edresson Casanova</a>, <a href="/search/cs?searchtype=author&query=Weber%2C+J">Julian Weber</a>, <a href="/search/cs?searchtype=author&query=Shulby%2C+C">Christopher Shulby</a>, <a href="/search/cs?searchtype=author&query=Junior%2C+A+C">Arnaldo Candido Junior</a>, <a href="/search/cs?searchtype=author&query=G%C3%B6lge%2C+E">Eren G枚lge</a>, <a href="/search/cs?searchtype=author&query=Ponti%2C+M+A">Moacir Antonelli Ponti</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2112.02418v4-abstract-short" style="display: inline;"> YourTTS brings the power of a multilingual approach to the task of zero-shot multi-speaker TTS. Our method builds upon the VITS model and adds several novel modifications for zero-shot multi-speaker and multilingual training. We achieved state-of-the-art (SOTA) results in zero-shot multi-speaker TTS and results comparable to SOTA in zero-shot voice conversion on the VCTK dataset. Additionally, our… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2112.02418v4-abstract-full').style.display = 'inline'; document.getElementById('2112.02418v4-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2112.02418v4-abstract-full" style="display: none;"> YourTTS brings the power of a multilingual approach to the task of zero-shot multi-speaker TTS. Our method builds upon the VITS model and adds several novel modifications for zero-shot multi-speaker and multilingual training. We achieved state-of-the-art (SOTA) results in zero-shot multi-speaker TTS and results comparable to SOTA in zero-shot voice conversion on the VCTK dataset. Additionally, our approach achieves promising results in a target language with a single-speaker dataset, opening possibilities for zero-shot multi-speaker TTS and zero-shot voice conversion systems in low-resource languages. Finally, it is possible to fine-tune the YourTTS model with less than 1 minute of speech and achieve state-of-the-art results in voice similarity and with reasonable quality. This is important to allow synthesis for speakers with a very different voice or recording characteristics from those seen during training. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2112.02418v4-abstract-full').style.display = 'none'; document.getElementById('2112.02418v4-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 April, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 December, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">An Erratum was added on the last page of this paper</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Proceedings of the 39th International Conference on Machine Learning, PMLR 162:2709-2720, 2022 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2110.15731">arXiv:2110.15731</a> <span> [<a href="https://arxiv.org/pdf/2110.15731">pdf</a>, <a href="https://arxiv.org/format/2110.15731">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> CORAA: a large corpus of spontaneous and prepared speech manually validated for speech recognition in Brazilian Portuguese </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Junior%2C+A+C">Arnaldo Candido Junior</a>, <a href="/search/cs?searchtype=author&query=Casanova%2C+E">Edresson Casanova</a>, <a href="/search/cs?searchtype=author&query=Soares%2C+A">Anderson Soares</a>, <a href="/search/cs?searchtype=author&query=de+Oliveira%2C+F+S">Frederico Santos de Oliveira</a>, <a href="/search/cs?searchtype=author&query=Oliveira%2C+L">Lucas Oliveira</a>, <a href="/search/cs?searchtype=author&query=Junior%2C+R+C+F">Ricardo Corso Fernandes Junior</a>, <a href="/search/cs?searchtype=author&query=da+Silva%2C+D+P+P">Daniel Peixoto Pinto da Silva</a>, <a href="/search/cs?searchtype=author&query=Fayet%2C+F+G">Fernando Gorgulho Fayet</a>, <a href="/search/cs?searchtype=author&query=Carlotto%2C+B+B">Bruno Baldissera Carlotto</a>, <a href="/search/cs?searchtype=author&query=Gris%2C+L+R+S">Lucas Rafael Stefanel Gris</a>, <a href="/search/cs?searchtype=author&query=Alu%C3%ADsio%2C+S+M">Sandra Maria Alu铆sio</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2110.15731v2-abstract-short" style="display: inline;"> Automatic Speech recognition (ASR) is a complex and challenging task. In recent years, there have been significant advances in the area. In particular, for the Brazilian Portuguese (BP) language, there were about 376 hours public available for ASR task until the second half of 2020. With the release of new datasets in early 2021, this number increased to 574 hours. The existing resources, however,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2110.15731v2-abstract-full').style.display = 'inline'; document.getElementById('2110.15731v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2110.15731v2-abstract-full" style="display: none;"> Automatic Speech recognition (ASR) is a complex and challenging task. In recent years, there have been significant advances in the area. In particular, for the Brazilian Portuguese (BP) language, there were about 376 hours public available for ASR task until the second half of 2020. With the release of new datasets in early 2021, this number increased to 574 hours. The existing resources, however, are composed of audios containing only read and prepared speech. There is a lack of datasets including spontaneous speech, which are essential in different ASR applications. This paper presents CORAA (Corpus of Annotated Audios) v1. with 290.77 hours, a publicly available dataset for ASR in BP containing validated pairs (audio-transcription). CORAA also contains European Portuguese audios (4.69 hours). We also present a public ASR model based on Wav2Vec 2.0 XLSR-53 and fine-tuned over CORAA. Our model achieved a Word Error Rate of 24.18% on CORAA test set and 20.08% on Common Voice test set. When measuring the Character Error Rate, we obtained 11.02% and 6.34% for CORAA and Common Voice, respectively. CORAA corpora were assembled to both improve ASR models in BP with phenomena from spontaneous speech and motivate young researchers to start their studies on ASR for Portuguese. All the corpora are publicly available at https://github.com/nilc-nlp/CORAA under the CC BY-NC-ND 4.0 license. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2110.15731v2-abstract-full').style.display = 'none'; document.getElementById('2110.15731v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2021; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 14 October, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This paper is under consideration at Language Resources and Evaluation (LREV)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2107.11414">arXiv:2107.11414</a> <span> [<a href="https://arxiv.org/pdf/2107.11414">pdf</a>, <a href="https://arxiv.org/format/2107.11414">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Brazilian Portuguese Speech Recognition Using Wav2vec 2.0 </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gris%2C+L+R+S">Lucas Rafael Stefanel Gris</a>, <a href="/search/cs?searchtype=author&query=Casanova%2C+E">Edresson Casanova</a>, <a href="/search/cs?searchtype=author&query=de+Oliveira%2C+F+S">Frederico Santos de Oliveira</a>, <a href="/search/cs?searchtype=author&query=Soares%2C+A+d+S">Anderson da Silva Soares</a>, <a href="/search/cs?searchtype=author&query=Junior%2C+A+C">Arnaldo Candido Junior</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2107.11414v3-abstract-short" style="display: inline;"> Deep learning techniques have been shown to be efficient in various tasks, especially in the development of speech recognition systems, that is, systems that aim to transcribe an audio sentence in a sequence of written words. Despite the progress in the area, speech recognition can still be considered difficult, especially for languages lacking available data, such as Brazilian Portuguese (BP). In… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2107.11414v3-abstract-full').style.display = 'inline'; document.getElementById('2107.11414v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2107.11414v3-abstract-full" style="display: none;"> Deep learning techniques have been shown to be efficient in various tasks, especially in the development of speech recognition systems, that is, systems that aim to transcribe an audio sentence in a sequence of written words. Despite the progress in the area, speech recognition can still be considered difficult, especially for languages lacking available data, such as Brazilian Portuguese (BP). In this sense, this work presents the development of an public Automatic Speech Recognition (ASR) system using only open available audio data, from the fine-tuning of the Wav2vec 2.0 XLSR-53 model pre-trained in many languages, over BP data. The final model presents an average word error rate of 12.4% over 7 different datasets (10.5% when applying a language model). According to our knowledge, the obtained error is the lowest among open end-to-end (E2E) ASR models for BP. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2107.11414v3-abstract-full').style.display = 'none'; document.getElementById('2107.11414v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 December, 2021; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 23 July, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2021. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2104.05557">arXiv:2104.05557</a> <span> [<a href="https://arxiv.org/pdf/2104.05557">pdf</a>, <a href="https://arxiv.org/format/2104.05557">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> SC-GlowTTS: an Efficient Zero-Shot Multi-Speaker Text-To-Speech Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Casanova%2C+E">Edresson Casanova</a>, <a href="/search/cs?searchtype=author&query=Shulby%2C+C">Christopher Shulby</a>, <a href="/search/cs?searchtype=author&query=G%C3%B6lge%2C+E">Eren G枚lge</a>, <a href="/search/cs?searchtype=author&query=M%C3%BCller%2C+N+M">Nicolas Michael M眉ller</a>, <a href="/search/cs?searchtype=author&query=de+Oliveira%2C+F+S">Frederico Santos de Oliveira</a>, <a href="/search/cs?searchtype=author&query=Junior%2C+A+C">Arnaldo Candido Junior</a>, <a href="/search/cs?searchtype=author&query=Soares%2C+A+d+S">Anderson da Silva Soares</a>, <a href="/search/cs?searchtype=author&query=Aluisio%2C+S+M">Sandra Maria Aluisio</a>, <a href="/search/cs?searchtype=author&query=Ponti%2C+M+A">Moacir Antonelli Ponti</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2104.05557v2-abstract-short" style="display: inline;"> In this paper, we propose SC-GlowTTS: an efficient zero-shot multi-speaker text-to-speech model that improves similarity for speakers unseen during training. We propose a speaker-conditional architecture that explores a flow-based decoder that works in a zero-shot scenario. As text encoders, we explore a dilated residual convolutional-based encoder, gated convolutional-based encoder, and transform… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2104.05557v2-abstract-full').style.display = 'inline'; document.getElementById('2104.05557v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2104.05557v2-abstract-full" style="display: none;"> In this paper, we propose SC-GlowTTS: an efficient zero-shot multi-speaker text-to-speech model that improves similarity for speakers unseen during training. We propose a speaker-conditional architecture that explores a flow-based decoder that works in a zero-shot scenario. As text encoders, we explore a dilated residual convolutional-based encoder, gated convolutional-based encoder, and transformer-based encoder. Additionally, we have shown that adjusting a GAN-based vocoder for the spectrograms predicted by the TTS model on the training dataset can significantly improve the similarity and speech quality for new speakers. Our model converges using only 11 speakers, reaching state-of-the-art results for similarity with new speakers, as well as high speech quality. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2104.05557v2-abstract-full').style.display = 'none'; document.getElementById('2104.05557v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 June, 2021; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 2 April, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted on Interspeech 2021</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2005.05144">arXiv:2005.05144</a> <span> [<a href="https://arxiv.org/pdf/2005.05144">pdf</a>, <a href="https://arxiv.org/format/2005.05144">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1007/s10579-021-09570-4">10.1007/s10579-021-09570-4 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> TTS-Portuguese Corpus: a corpus for speech synthesis in Brazilian Portuguese </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Casanova%2C+E">Edresson Casanova</a>, <a href="/search/cs?searchtype=author&query=Junior%2C+A+C">Arnaldo Candido Junior</a>, <a href="/search/cs?searchtype=author&query=Shulby%2C+C">Christopher Shulby</a>, <a href="/search/cs?searchtype=author&query=de+Oliveira%2C+F+S">Frederico Santos de Oliveira</a>, <a href="/search/cs?searchtype=author&query=Teixeira%2C+J+P">Jo茫o Paulo Teixeira</a>, <a href="/search/cs?searchtype=author&query=Ponti%2C+M+A">Moacir Antonelli Ponti</a>, <a href="/search/cs?searchtype=author&query=Aluisio%2C+S+M">Sandra Maria Aluisio</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2005.05144v4-abstract-short" style="display: inline;"> Speech provides a natural way for human-computer interaction. In particular, speech synthesis systems are popular in different applications, such as personal assistants, GPS applications, screen readers and accessibility tools. However, not all languages are on the same level when in terms of resources and systems for speech synthesis. This work consists of creating publicly available resources fo… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2005.05144v4-abstract-full').style.display = 'inline'; document.getElementById('2005.05144v4-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2005.05144v4-abstract-full" style="display: none;"> Speech provides a natural way for human-computer interaction. In particular, speech synthesis systems are popular in different applications, such as personal assistants, GPS applications, screen readers and accessibility tools. However, not all languages are on the same level when in terms of resources and systems for speech synthesis. This work consists of creating publicly available resources for Brazilian Portuguese in the form of a novel dataset along with deep learning models for end-to-end speech synthesis. Such dataset has 10.5 hours from a single speaker, from which a Tacotron 2 model with the RTISI-LA vocoder presented the best performance, achieving a 4.03 MOS value. The obtained results are comparable to related works covering English language and the state-of-the-art in Portuguese. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2005.05144v4-abstract-full').style.display = 'none'; document.getElementById('2005.05144v4-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 January, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 11 May, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2020. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2002.11213">arXiv:2002.11213</a> <span> [<a href="https://arxiv.org/pdf/2002.11213">pdf</a>, <a href="https://arxiv.org/format/2002.11213">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Speech2Phone: A Novel and Efficient Method for Training Speaker Recognition Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Casanova%2C+E">Edresson Casanova</a>, <a href="/search/cs?searchtype=author&query=Junior%2C+A+C">Arnaldo Candido Junior</a>, <a href="/search/cs?searchtype=author&query=Shulby%2C+C">Christopher Shulby</a>, <a href="/search/cs?searchtype=author&query=de+Oliveira%2C+F+S">Frederico Santos de Oliveira</a>, <a href="/search/cs?searchtype=author&query=Gris%2C+L+R+S">Lucas Rafael Stefanel Gris</a>, <a href="/search/cs?searchtype=author&query=da+Silva%2C+H+P">Hamilton Pereira da Silva</a>, <a href="/search/cs?searchtype=author&query=Aluisio%2C+S+M">Sandra Maria Aluisio</a>, <a href="/search/cs?searchtype=author&query=Ponti%2C+M+A">Moacir Antonelli Ponti</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2002.11213v2-abstract-short" style="display: inline;"> In this paper we present an efficient method for training models for speaker recognition using small or under-resourced datasets. This method requires less data than other SOTA (State-Of-The-Art) methods, e.g. the Angular Prototypical and GE2E loss functions, while achieving similar results to those methods. This is done using the knowledge of the reconstruction of a phoneme in the speaker's voice… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2002.11213v2-abstract-full').style.display = 'inline'; document.getElementById('2002.11213v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2002.11213v2-abstract-full" style="display: none;"> In this paper we present an efficient method for training models for speaker recognition using small or under-resourced datasets. This method requires less data than other SOTA (State-Of-The-Art) methods, e.g. the Angular Prototypical and GE2E loss functions, while achieving similar results to those methods. This is done using the knowledge of the reconstruction of a phoneme in the speaker's voice. For this purpose, a new dataset was built, composed of 40 male speakers, who read sentences in Portuguese, totaling approximately 3h. We compare the three best architectures trained using our method to select the best one, which is the one with a shallow architecture. Then, we compared this model with the SOTA method for the speaker recognition task: the Fast ResNet-34 trained with approximately 2,000 hours, using the loss functions Angular Prototypical and GE2E. Three experiments were carried out with datasets in different languages. Among these three experiments, our model achieved the second best result in two experiments and the best result in one of them. This highlights the importance of our method, which proved to be a great competitor to SOTA speaker recognition models, with 500x less data and a simpler approach. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2002.11213v2-abstract-full').style.display = 'none'; document.getElementById('2002.11213v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 June, 2021; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 25 February, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2020. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Submitted to BRACIS</span> </p> </li> </ol> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>