Search | arXiv e-print repository
<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 97 results for author: <span class="mathjax">Toda, T</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Toda%2C+T">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Toda, T"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Toda%2C+T&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Toda, T"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Toda%2C+T&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Toda%2C+T&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Toda%2C+T&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.18486">arXiv:2503.18486</a> <span> [<a href="https://arxiv.org/pdf/2503.18486">pdf</a>, <a href="https://arxiv.org/format/2503.18486">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Music Similarity Representation Learning Focusing on Individual Instruments with Source Separation and Human Preference </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Imamura%2C+T">Takehiro Imamura</a>, <a href="/search/cs?searchtype=author&query=Hashizume%2C+Y">Yuka Hashizume</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+W">Wen-Chin Huang</a>, <a href="/search/cs?searchtype=author&query=Toda%2C+T">Tomoki Toda</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.18486v1-abstract-short" style="display: inline;"> This paper proposes music similarity representation learning (MSRL) based on individual instrument sounds (InMSRL) utilizing music source separation (MSS) and human preference without requiring clean instrument sounds during inference. We propose three methods that effectively improve performance. First, we introduce end-to-end fine-tuning (E2E-FT) for the Cascade approach that sequentially perfor… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.18486v1-abstract-full').style.display = 'inline'; document.getElementById('2503.18486v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.18486v1-abstract-full" style="display: none;"> This paper proposes music similarity representation learning (MSRL) based on individual instrument sounds (InMSRL) utilizing music source separation (MSS) and human preference without requiring clean instrument sounds during inference. 
We propose three methods that effectively improve performance. First, we introduce end-to-end fine-tuning (E2E-FT) for the Cascade approach that sequentially performs MSS and music similarity feature extraction. E2E-FT allows the model to minimize the adverse effects of a separation error on the feature extraction. Second, we propose multi-task learning for the Direct approach that directly extracts disentangled music similarity features using a single music similarity feature extractor. Multi-task learning, which is based on the disentangled music similarity feature extraction and MSS based on reconstruction with disentangled music similarity features, further enhances instrument feature disentanglement. Third, we employ perception-aware fine-tuning (PAFT). PAFT utilizes human preference, allowing the model to perform InMSRL aligned with human perceptual similarity. We conduct experimental evaluations and demonstrate that 1) E2E-FT for Cascade significantly improves InMSRL performance, 2) the multi-task learning for Direct is also helpful to improve disentanglement performance in the feature extraction, 3) PAFT significantly enhances the perceptual InMSRL performance, and 4) Cascade with E2E-FT and PAFT outperforms Direct with the multi-task learning and PAFT. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.18486v1-abstract-full').style.display = 'none'; document.getElementById('2503.18486v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.17281">arXiv:2503.17281</a> <span> [<a href="https://arxiv.org/pdf/2503.17281">pdf</a>, <a href="https://arxiv.org/format/2503.17281">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Learning disentangled representations for instrument-based music similarity </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hashizume%2C+Y">Yuka Hashizume</a>, <a href="/search/cs?searchtype=author&query=Li%2C+L">Li Li</a>, <a href="/search/cs?searchtype=author&query=Miyashita%2C+A">Atsushi Miyashita</a>, <a href="/search/cs?searchtype=author&query=Toda%2C+T">Tomoki Toda</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.17281v1-abstract-short" style="display: inline;"> A flexible recommendation and retrieval system requires music similarity in terms of multiple partial elements of musical pieces to allow users to select the element they want to focus on. 
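Since this abstract centers on a concrete training setup, a brief sketch may help readers picture the Cascade variant with end-to-end fine-tuning (E2E-FT): a separator feeds per-instrument embedders, and the similarity loss backpropagates through both. Everything below, the toy module architectures, the triplet objective, and the tensor shapes, is an illustrative assumption rather than the paper's implementation.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

# Hypothetical stand-ins: a real system would use a pretrained MSS model
# and a much stronger similarity encoder than these toy modules.
class ToySeparator(nn.Module):
    """Maps a mixture waveform to K per-instrument estimates."""
    def __init__(self, n_instruments=4):
        super().__init__()
        self.proj = nn.Conv1d(1, n_instruments, kernel_size=9, padding=4)

    def forward(self, mix):                 # mix: (B, T)
        return self.proj(mix.unsqueeze(1))  # (B, K, T)

class ToyEmbedder(nn.Module):
    """Maps one separated stem to a similarity embedding."""
    def __init__(self, dim=128):
        super().__init__()
        self.net = nn.Sequential(nn.Conv1d(1, 32, 16, stride=8), nn.ReLU(),
                                 nn.AdaptiveAvgPool1d(1), nn.Flatten(),
                                 nn.Linear(32, dim))

    def forward(self, stem):                # stem: (B, T)
        return F.normalize(self.net(stem.unsqueeze(1)), dim=-1)

class CascadeInMSRL(nn.Module):
    """Cascade: MSS -> per-instrument feature extraction, trainable end to end."""
    def __init__(self, n_instruments=4, dim=128):
        super().__init__()
        self.separator = ToySeparator(n_instruments)
        self.embedders = nn.ModuleList(ToyEmbedder(dim) for _ in range(n_instruments))

    def forward(self, mix):
        stems = self.separator(mix)          # gradients reach the separator (E2E-FT)
        return torch.stack([emb(stems[:, k]) for k, emb in enumerate(self.embedders)], dim=1)

model = CascadeInMSRL()
anchor, positive, negative = (torch.randn(2, 16000) for _ in range(3))
ea, ep, en = model(anchor), model(positive), model(negative)   # each (B, K, dim)
loss = F.triplet_margin_loss(ea.flatten(1), ep.flatten(1), en.flatten(1), margin=0.2)
loss.backward()   # updates both the embedders and the separator
```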
2. arXiv:2503.17281 [pdf, other] (cs.SD; eess.AS)
   Title: Learning disentangled representations for instrument-based music similarity
   Authors: Yuka Hashizume, Li Li, Atsushi Miyashita, Tomoki Toda
   Abstract: A flexible recommendation and retrieval system requires music similarity in terms of multiple partial elements of musical pieces to allow users to select the element they want to focus on. A method for music similarity learning using multiple networks with individual instrumental signals is effective but faces the problem that using each clean instrumental signal as a query is impractical for retrieval systems and using separated instrumental sounds reduces accuracy owing to artifacts. In this paper, we present instrumental-part-based music similarity learning with a single network that takes mixed sounds as input instead of individual instrumental sounds. Specifically, we designed a single similarity embedding space with disentangled dimensions for each instrument, extracted by Conditional Similarity Networks, which are trained using the triplet loss with masks. Experimental results showed that (1) the proposed method can obtain more accurate feature representation than using individual networks using separated sounds as input in the evaluation of an instrument that had low accuracy, (2) each sub-embedding space can hold the characteristics of the corresponding instrument, and (3) the selection of similar musical pieces focusing on each instrumental sound by the proposed method can obtain human acceptance, especially when focusing on timbre.
   Submitted 21 March, 2025; originally announced March 2025.
   Comments: arXiv admin note: text overlap with arXiv:2404.06682
3. arXiv:2503.12388 [pdf, other] (cs.SD; eess.AS)
   Title: Serenade: A Singing Style Conversion Framework Based On Audio Infilling
   Authors: Lester Phillip Violeta, Wen-Chin Huang, Tomoki Toda
   Abstract: We propose Serenade, a novel framework for the singing style conversion (SSC) task. Although singer identity conversion has made great strides in the previous years, converting the singing style of a singer has been an unexplored research area. We find three main challenges in SSC: modeling the target style, disentangling source style, and retaining the source melody. To model the target singing style, we use an audio infilling task by predicting a masked segment of the target mel-spectrogram with a flow-matching model using the complement of the masked target mel-spectrogram along with disentangled acoustic features. On the other hand, to disentangle the source singing style, we use a cyclic training approach, where we use synthetic converted samples as source inputs and reconstruct the original source mel-spectrogram as a target. Finally, to retain the source melody better, we investigate a post-processing module using a source-filter-based vocoder and resynthesize the converted waveforms using the original F0 patterns. Our results showed that the Serenade framework can handle generalized SSC tasks with the best overall similarity score, especially in modeling breathy and mixed singing styles. Moreover, although resynthesizing with the original F0 patterns alleviated out-of-tune singing and improved naturalness, we found a slight tradeoff in similarity due to not changing the F0 patterns into the target style.
   Submitted 16 March, 2025; originally announced March 2025.
   Comments: Preprint under review
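The infilling objective described above, predicting a masked segment of the target mel-spectrogram with a flow-matching model conditioned on the unmasked complement, can be illustrated with a toy conditional flow-matching training step. The network, mask policy, and tensor sizes below are placeholders and not Serenade's actual architecture or conditioning features.

```python
import torch
import torch.nn as nn

class ToyVelocityNet(nn.Module):
    """Stand-in for the flow-matching network: predicts a velocity field
    from the noisy mel, the unmasked context, and the time step t."""
    def __init__(self):
        super().__init__()
        self.net = nn.Sequential(nn.Conv2d(2, 32, 3, padding=1), nn.GELU(),
                                 nn.Conv2d(32, 1, 3, padding=1))
        self.t_proj = nn.Linear(1, 32)

    def forward(self, x_t, context, t):
        h = torch.cat([x_t, context], dim=1)            # (B, 2, n_mels, frames)
        h = self.net[0](h) + self.t_proj(t).unsqueeze(-1).unsqueeze(-1)
        return self.net[2](self.net[1](h))

def infilling_fm_step(mel, net):
    """One conditional flow-matching step on a masked mel-spectrogram."""
    B, _, n_mels, T = mel.shape
    mask = torch.zeros_like(mel)
    s = T // 4
    mask[..., s:2 * s] = 1.0                   # mask a contiguous segment to be infilled
    context = mel * (1.0 - mask)               # complement of the masked target
    x0, x1 = torch.randn_like(mel), mel        # noise -> data endpoints
    t = torch.rand(B, 1)
    x_t = (1 - t).view(B, 1, 1, 1) * x0 + t.view(B, 1, 1, 1) * x1
    v_pred = net(x_t, context, t)
    return ((v_pred - (x1 - x0)) ** 2 * mask).sum() / mask.sum()

net = ToyVelocityNet()
loss = infilling_fm_step(torch.randn(2, 1, 80, 200), net)  # fake (B, 1, mel, frames) batch
loss.backward()
```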
4. arXiv:2503.10435 [pdf, other] (eess.AS; cs.SD)
   Title: Handling Domain Shifts for Anomalous Sound Detection: A Review of DCASE-Related Work
   Authors: Kevin Wilkinghoff, Takuya Fujimura, Keisuke Imoto, Jonathan Le Roux, Zheng-Hua Tan, Tomoki Toda
   Abstract: When detecting anomalous sounds in complex environments, one of the main difficulties is that trained models must be sensitive to subtle differences in monitored target signals, while many practical applications also require them to be insensitive to changes in acoustic domains. Examples of such domain shifts include changing the type of microphone or the location of acoustic sensors, which can have a much stronger impact on the acoustic signal than subtle anomalies themselves. Moreover, users typically aim to train a model only on source domain data, which they may have a relatively large collection of, and they hope that such a trained model will be able to generalize well to an unseen target domain by providing only a minimal number of samples to characterize the acoustic signals in that domain. In this work, we review and discuss recent publications focusing on this domain generalization problem for anomalous sound detection in the context of the DCASE challenges on acoustic machine condition monitoring.
   Submitted 13 March, 2025; originally announced March 2025.

5. arXiv:2502.02138 [pdf, other] (cs.SD; eess.AS)
   Title: Investigation of perceptual music similarity focusing on each instrumental part
   Authors: Yuka Hashizume, Tomoki Toda
   Abstract: This paper presents an investigation of perceptual similarity between music tracks focusing on each individual instrumental part based on a large-scale listening test towards developing an instrumental-part-based music retrieval. In the listening test, 586 subjects evaluate the perceptual similarity of the audio tracks through an ABX test. We use the music tracks and their stems in the test set of the slakh2100 dataset. The perceptual similarity is evaluated based on four perspectives: timbre, rhythm, melody, and overall. We have analyzed the results of the listening test and have found that 1) perceptual music similarity varies depending on which instrumental part is focused on within each track; 2) rhythm and melody tend to have a larger impact on the perceptual music similarity than timbre except for the melody of drums; and 3) the previously proposed music similarity features tend to capture the perceptual similarity on timbre mainly.
   Submitted 4 February, 2025; originally announced February 2025.
   Comments: Accepted ICASSP 2025

6. arXiv:2411.06807 [pdf, other] (cs.SD; eess.AS)
   Title: Wavehax: Aliasing-Free Neural Waveform Synthesis Based on 2D Convolution and Harmonic Prior for Reliable Complex Spectrogram Estimation
   Authors: Reo Yoneyama, Atsushi Miyashita, Ryuichi Yamamoto, Tomoki Toda
   Abstract: Neural vocoders often struggle with aliasing in latent feature spaces, caused by time-domain nonlinear operations and resampling layers. Aliasing folds high-frequency components into the low-frequency range, making aliased and original frequency components indistinguishable and introducing two practical issues. First, aliasing complicates the waveform generation process, as the subsequent layers must address these aliasing effects, increasing the computational complexity. Second, it limits extrapolation performance, particularly in handling high fundamental frequencies, which degrades the perceptual quality of generated speech waveforms. This paper demonstrates that 1) time-domain nonlinear operations inevitably introduce aliasing but provide a strong inductive bias for harmonic generation, and 2) time-frequency-domain processing can achieve aliasing-free waveform synthesis but lacks the inductive bias for effective harmonic generation. Building on this insight, we propose Wavehax, an aliasing-free neural WAVEform generator that integrates 2D convolution and a HArmonic prior for reliable Complex Spectrogram estimation. Experimental results show that Wavehax achieves speech quality comparable to existing high-fidelity neural vocoders and exhibits exceptional robustness in scenarios requiring high fundamental frequency extrapolation, where aliasing effects become typically severe. Moreover, Wavehax requires less than 5% of the multiply-accumulate operations and model parameters compared to HiFi-GAN V1, while achieving over four times faster CPU inference speed.
   Submitted 11 November, 2024; originally announced November 2024.
   Comments: 13 pages, 5 figures, Submitted to IEEE/ACM Trans. ASLP
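As a rough illustration of the combination this abstract describes, a harmonic prior plus a 2D convolutional network operating on a complex spectrogram, with the inverse STFT as the only time-domain step, here is a minimal sketch. The prior construction, the tiny network, and the STFT settings are assumptions for exposition, not Wavehax's published design.

```python
import torch
import torch.nn as nn

def harmonic_prior(f0, sr=24000, hop=240, n_harm=8):
    """Sum-of-sinusoids excitation from a frame-level F0 contour (0 = unvoiced).
    f0: (B, frames). Returns a waveform of length frames * hop. n_harm is an
    illustrative choice."""
    f0_up = f0.repeat_interleave(hop, dim=-1)                  # sample-level F0
    phase = 2 * torch.pi * torch.cumsum(f0_up / sr, dim=-1)
    k = torch.arange(1, n_harm + 1).view(1, -1, 1)
    wave = torch.sin(k * phase.unsqueeze(1)).sum(dim=1) / n_harm
    return wave * (f0_up > 0)                                  # silence unvoiced frames

class ToyComplexSpecNet(nn.Module):
    """2D CNN refining the prior's complex spectrogram (real/imag as channels)."""
    def __init__(self):
        super().__init__()
        self.net = nn.Sequential(nn.Conv2d(2, 32, 3, padding=1), nn.GELU(),
                                 nn.Conv2d(32, 2, 3, padding=1))

    def forward(self, spec_ri):
        return self.net(spec_ri)

n_fft, hop = 480, 240
f0 = torch.full((1, 100), 220.0)                # 100 frames of a constant 220 Hz F0
prior = harmonic_prior(f0, hop=hop)
window = torch.hann_window(n_fft)
spec = torch.stft(prior, n_fft, hop_length=hop, window=window, return_complex=True)
spec_ri = torch.stack([spec.real, spec.imag], dim=1)          # (B, 2, freq, frames)
out_ri = ToyComplexSpecNet()(spec_ri)
out = torch.complex(out_ri[:, 0], out_ri[:, 1])
# The inverse STFT is the only time-domain step, so no nonlinearity acts on the waveform.
wave = torch.istft(out, n_fft, hop_length=hop, window=window)
```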
7. arXiv:2411.03715 [pdf, other] (cs.SD; eess.AS)
   Title: MOS-Bench: Benchmarking Generalization Abilities of Subjective Speech Quality Assessment Models
   Authors: Wen-Chin Huang, Erica Cooper, Tomoki Toda
   Abstract: Subjective speech quality assessment (SSQA) is critical for evaluating speech samples as perceived by human listeners. While model-based SSQA has enjoyed great success thanks to the development of deep neural networks (DNNs), generalization remains a key challenge, especially for unseen, out-of-domain data. To benchmark the generalization abilities of SSQA models, we present MOS-Bench, a diverse collection of datasets. We also introduce SHEET, an open-source toolkit containing complete recipes to conduct SSQA experiments. We provide benchmark results for MOS-Bench and explore multi-dataset training to enhance generalization. Additionally, we propose a new performance metric, best score difference/ratio, and use latent space visualizations to explain model behavior, offering valuable insights for future research.
   Submitted 6 November, 2024; originally announced November 2024.
   Comments: Submitted to Transactions on Audio, Speech and Language Processing. This work has been submitted to the IEEE for possible publication.

8. arXiv:2409.19614 [pdf, other] (cs.SD; eess.AS)
   Title: Improved Architecture for High-resolution Piano Transcription to Efficiently Capture Acoustic Characteristics of Music Signals
   Authors: Jinyi Mi, Sehun Kim, Tomoki Toda
   Abstract: Automatic music transcription (AMT), aiming to convert musical signals into musical notation, is one of the important tasks in music information retrieval. Recently, previous works have applied high-resolution labels, i.e., the continuous onset and offset times of piano notes, as training targets, achieving substantial improvements in transcription performance. However, some issues remain to be addressed, e.g., the harmonics of notes are sometimes recognized as false positive notes, and the size of the AMT model tends to be increased to improve the transcription performance. To address these issues, we propose an improved high-resolution piano transcription model to well capture specific acoustic characteristics of music signals. First, we employ the Constant-Q Transform as the input representation to better adapt to musical signals. Moreover, we have designed two architectures: the first is based on a convolutional recurrent neural network (CRNN) with dilated convolution, and the second is an encoder-decoder architecture that combines CRNN with a non-autoregressive Transformer decoder. We conduct systematic experiments for our models. Compared to the high-resolution AMT system used as a baseline, our models effectively achieve 1) consistent improvement in note-level metrics, and 2) a significantly smaller model size, which sheds light on future work.
   Submitted 29 September, 2024; originally announced September 2024.
   Comments: Accepted to APSIPA ASC 2024
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.19585v2-abstract-full').style.display = 'none'; document.getElementById('2409.19585v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 29 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This is the preprint version of the paper accepted at APSIPA ASC 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.09332">arXiv:2409.09332</a> <span> [<a href="https://arxiv.org/pdf/2409.09332">pdf</a>, <a href="https://arxiv.org/format/2409.09332">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Improvements of Discriminative Feature Space Training for Anomalous Sound Detection in Unlabeled Conditions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Fujimura%2C+T">Takuya Fujimura</a>, <a href="/search/cs?searchtype=author&query=Kuroyanagi%2C+I">Ibuki Kuroyanagi</a>, <a href="/search/cs?searchtype=author&query=Toda%2C+T">Tomoki Toda</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.09332v1-abstract-short" style="display: inline;"> In anomalous sound detection, the discriminative method has demonstrated superior performance. This approach constructs a discriminative feature space through the classification of the meta-information labels for normal sounds. This feature space reflects the differences in machine sounds and effectively captures anomalous sounds. However, its performance significantly degrades when the meta-infor… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.09332v1-abstract-full').style.display = 'inline'; document.getElementById('2409.09332v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.09332v1-abstract-full" style="display: none;"> In anomalous sound detection, the discriminative method has demonstrated superior performance. This approach constructs a discriminative feature space through the classification of the meta-information labels for normal sounds. This feature space reflects the differences in machine sounds and effectively captures anomalous sounds. However, its performance significantly degrades when the meta-information labels are missing. In this paper, we improve the performance of a discriminative method under unlabeled conditions by two approaches. First, we enhance the feature extractor to perform better under unlabeled conditions. Our enhanced feature extractor utilizes multi-resolution spectrograms with a new training strategy. 
Second, we propose various pseudo-labeling methods to effectively train the feature extractor. The experimental evaluations show that the proposed feature extractor and pseudo-labeling methods significantly improve performance under unlabeled conditions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.09332v1-abstract-full').style.display = 'none'; document.getElementById('2409.09332v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Submitted to ICASSP2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.07001">arXiv:2409.07001</a> <span> [<a href="https://arxiv.org/pdf/2409.07001">pdf</a>, <a href="https://arxiv.org/format/2409.07001">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> The VoiceMOS Challenge 2024: Beyond Speech Quality Prediction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Huang%2C+W">Wen-Chin Huang</a>, <a href="/search/cs?searchtype=author&query=Fu%2C+S">Szu-Wei Fu</a>, <a href="/search/cs?searchtype=author&query=Cooper%2C+E">Erica Cooper</a>, <a href="/search/cs?searchtype=author&query=Zezario%2C+R+E">Ryandhimas E. Zezario</a>, <a href="/search/cs?searchtype=author&query=Toda%2C+T">Tomoki Toda</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Hsin-Min Wang</a>, <a href="/search/cs?searchtype=author&query=Yamagishi%2C+J">Junichi Yamagishi</a>, <a href="/search/cs?searchtype=author&query=Tsao%2C+Y">Yu Tsao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.07001v1-abstract-short" style="display: inline;"> We present the third edition of the VoiceMOS Challenge, a scientific initiative designed to advance research into automatic prediction of human speech ratings. There were three tracks. The first track was on predicting the quality of ``zoomed-in'' high-quality samples from speech synthesis systems. The second track was to predict ratings of samples from singing voice synthesis and voice conversion… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.07001v1-abstract-full').style.display = 'inline'; document.getElementById('2409.07001v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.07001v1-abstract-full" style="display: none;"> We present the third edition of the VoiceMOS Challenge, a scientific initiative designed to advance research into automatic prediction of human speech ratings. There were three tracks. The first track was on predicting the quality of ``zoomed-in'' high-quality samples from speech synthesis systems. 
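Under unlabeled conditions, pseudo-labels stand in for the missing meta-information labels so that the discriminative recipe can still be applied. The sketch below uses generic stand-ins (k-means pseudo-labels on precomputed embeddings, a logistic-regression classifier, and a max-posterior anomaly score); the paper's specific pseudo-labeling methods and multi-resolution feature extractor are not reproduced here.

```python
import numpy as np
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression

# Hypothetical setup: random vectors stand in for embeddings of normal clips
# produced by some frozen feature extractor.
rng = np.random.default_rng(0)
train_emb = rng.normal(size=(500, 64))            # normal clips, no meta-info labels
test_emb = rng.normal(size=(50, 64))

# Step 1: clustering provides pseudo-labels in place of the missing labels.
kmeans = KMeans(n_clusters=8, n_init=10, random_state=0).fit(train_emb)
pseudo_labels = kmeans.labels_

# Step 2: train a discriminative model on the pseudo-labels; its decision space
# separates the different types of normal sounds.
clf = LogisticRegression(max_iter=1000).fit(train_emb, pseudo_labels)

# Step 3: score test clips; a low maximum posterior (the clip confidently
# resembles none of the pseudo-classes) is treated as a higher anomaly score.
anomaly_score = 1.0 - clf.predict_proba(test_emb).max(axis=1)
```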
The second track was to predict ratings of samples from singing voice synthesis and voice conversion with a large variety of systems, listeners, and languages. The third track was semi-supervised quality prediction for noisy, clean, and enhanced speech, where a very small amount of labeled training data was provided. Among the eight teams from both academia and industry, we found that many were able to outperform the baseline systems. Successful techniques included retrieval-based methods and the use of non-self-supervised representations like spectrograms and pitch histograms. These results showed that the challenge has advanced the field of subjective speech rating prediction. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.07001v1-abstract-full').style.display = 'none'; document.getElementById('2409.07001v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to SLT2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.16132">arXiv:2408.16132</a> <span> [<a href="https://arxiv.org/pdf/2408.16132">pdf</a>, <a href="https://arxiv.org/format/2408.16132">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> SVDD 2024: The Inaugural Singing Voice Deepfake Detection Challenge </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">You Zhang</a>, <a href="/search/cs?searchtype=author&query=Zang%2C+Y">Yongyi Zang</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+J">Jiatong Shi</a>, <a href="/search/cs?searchtype=author&query=Yamamoto%2C+R">Ryuichi Yamamoto</a>, <a href="/search/cs?searchtype=author&query=Toda%2C+T">Tomoki Toda</a>, <a href="/search/cs?searchtype=author&query=Duan%2C+Z">Zhiyao Duan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.16132v2-abstract-short" style="display: inline;"> With the advancements in singing voice generation and the growing presence of AI singers on media platforms, the inaugural Singing Voice Deepfake Detection (SVDD) Challenge aims to advance research in identifying AI-generated singing voices from authentic singers. This challenge features two tracks: a controlled setting track (CtrSVDD) and an in-the-wild scenario track (WildSVDD). 
The CtrSVDD trac… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.16132v2-abstract-full').style.display = 'inline'; document.getElementById('2408.16132v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.16132v2-abstract-full" style="display: none;"> With the advancements in singing voice generation and the growing presence of AI singers on media platforms, the inaugural Singing Voice Deepfake Detection (SVDD) Challenge aims to advance research in identifying AI-generated singing voices from authentic singers. This challenge features two tracks: a controlled setting track (CtrSVDD) and an in-the-wild scenario track (WildSVDD). The CtrSVDD track utilizes publicly available singing vocal data to generate deepfakes using state-of-the-art singing voice synthesis and conversion systems. Meanwhile, the WildSVDD track expands upon the existing SingFake dataset, which includes data sourced from popular user-generated content websites. For the CtrSVDD track, we received submissions from 47 teams, with 37 surpassing our baselines and the top team achieving a 1.65% equal error rate. For the WildSVDD track, we benchmarked the baselines. This paper reviews these results, discusses key findings, and outlines future directions for SVDD research. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.16132v2-abstract-full').style.display = 'none'; document.getElementById('2408.16132v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 28 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">6 pages, Accepted by 2024 IEEE Spoken Language Technology Workshop (SLT 2024)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.06208">arXiv:2406.06208</a> <span> [<a href="https://arxiv.org/pdf/2406.06208">pdf</a>, <a href="https://arxiv.org/format/2406.06208">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Quantifying the effect of speech pathology on automatic and human speaker verification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Halpern%2C+B+M">Bence Mark Halpern</a>, <a href="/search/cs?searchtype=author&query=Tienkamp%2C+T">Thomas Tienkamp</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+W">Wen-Chin Huang</a>, <a href="/search/cs?searchtype=author&query=Violeta%2C+L+P">Lester Phillip Violeta</a>, <a href="/search/cs?searchtype=author&query=Rebernik%2C+T">Teja Rebernik</a>, <a href="/search/cs?searchtype=author&query=de+Visscher%2C+S">Sebastiaan de Visscher</a>, <a href="/search/cs?searchtype=author&query=Witjes%2C+M">Max Witjes</a>, <a href="/search/cs?searchtype=author&query=Wieling%2C+M">Martijn Wieling</a>, <a href="/search/cs?searchtype=author&query=Abur%2C+D">Defne Abur</a>, <a href="/search/cs?searchtype=author&query=Toda%2C+T">Tomoki Toda</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.06208v1-abstract-short" style="display: inline;"> This study investigates how surgical intervention for speech pathology (specifically, as a result of oral cancer surgery) impacts the performance of an automatic speaker verification (ASV) system. Using two recently collected Dutch datasets with parallel pre and post-surgery audio from the same speaker, NKI-OC-VC and SPOKE, we assess the extent to which speech pathology influences ASV performance,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.06208v1-abstract-full').style.display = 'inline'; document.getElementById('2406.06208v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.06208v1-abstract-full" style="display: none;"> This study investigates how surgical intervention for speech pathology (specifically, as a result of oral cancer surgery) impacts the performance of an automatic speaker verification (ASV) system. Using two recently collected Dutch datasets with parallel pre and post-surgery audio from the same speaker, NKI-OC-VC and SPOKE, we assess the extent to which speech pathology influences ASV performance, and whether objective/subjective measures of speech severity are correlated with the performance. Finally, we carry out a perceptual study to compare judgements of ASV and human listeners. Our findings reveal that pathological speech negatively affects ASV performance, and the severity of the speech is negatively correlated with the performance. 
There is moderate agreement between perceptual and objective scores of speaker similarity and severity; however, the perceptual study could not clearly establish whether the same phenomenon also exists in human perception. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.06208v1-abstract-full').style.display = 'none'; document.getElementById('2406.06208v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages, 2 figures, 2 tables. Accepted to Interspeech 2024</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.7 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.06201">arXiv:2406.06201</a> <span> [<a href="https://arxiv.org/pdf/2406.06201">pdf</a>, <a href="https://arxiv.org/format/2406.06201">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> 2DP-2MRC: 2-Dimensional Pointer-based Machine Reading Comprehension Method for Multimodal Moment Retrieval </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=He%2C+J">Jiajun He</a>, <a href="/search/cs?searchtype=author&query=Toda%2C+T">Tomoki Toda</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.06201v1-abstract-short" style="display: inline;"> Moment retrieval aims to locate the most relevant moment in an untrimmed video based on a given natural language query. Existing solutions can be roughly categorized into moment-based and clip-based methods. The former often involves heavy computations, while the latter, due to overlooking coarse-grained information, typically underperforms compared to moment-based models. Hence, this paper propos… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.06201v1-abstract-full').style.display = 'inline'; document.getElementById('2406.06201v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.06201v1-abstract-full" style="display: none;"> Moment retrieval aims to locate the most relevant moment in an untrimmed video based on a given natural language query. Existing solutions can be roughly categorized into moment-based and clip-based methods. The former often involves heavy computations, while the latter, due to overlooking coarse-grained information, typically underperforms compared to moment-based models.
Hence, this paper proposes a novel 2-Dimensional Pointer-based Machine Reading Comprehension for Moment Retrieval Choice (2DP-2MRC) model to address the issue of imprecise localization in clip-based methods while maintaining lower computational complexity than moment-based methods. Specifically, we introduce an AV-Encoder to capture coarse-grained information at moment and video levels. Additionally, a 2D pointer encoder module is introduced to further enhance boundary detection for target moment. Extensive experiments on the HiREST dataset demonstrate that 2DP-2MRC significantly outperforms existing baseline models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.06201v1-abstract-full').style.display = 'none'; document.getElementById('2406.06201v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by INTERSPEECH 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.02438">arXiv:2406.02438</a> <span> [<a href="https://arxiv.org/pdf/2406.02438">pdf</a>, <a href="https://arxiv.org/format/2406.02438">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.21437/Interspeech.2024-2242">10.21437/Interspeech.2024-2242 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> CtrSVDD: A Benchmark Dataset and Baseline Analysis for Controlled Singing Voice Deepfake Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zang%2C+Y">Yongyi Zang</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+J">Jiatong Shi</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">You Zhang</a>, <a href="/search/cs?searchtype=author&query=Yamamoto%2C+R">Ryuichi Yamamoto</a>, <a href="/search/cs?searchtype=author&query=Han%2C+J">Jionghao Han</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+Y">Yuxun Tang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+S">Shengyuan Xu</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+W">Wenxiao Zhao</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+J">Jing Guo</a>, <a href="/search/cs?searchtype=author&query=Toda%2C+T">Tomoki Toda</a>, <a href="/search/cs?searchtype=author&query=Duan%2C+Z">Zhiyao Duan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.02438v2-abstract-short" style="display: inline;"> Recent singing voice synthesis and conversion advancements necessitate robust 
singing voice deepfake detection (SVDD) models. Current SVDD datasets face challenges due to limited controllability, diversity in deepfake methods, and licensing restrictions. Addressing these gaps, we introduce CtrSVDD, a large-scale, diverse collection of bonafide and deepfake singing vocals. These vocals are synthesi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.02438v2-abstract-full').style.display = 'inline'; document.getElementById('2406.02438v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.02438v2-abstract-full" style="display: none;"> Recent singing voice synthesis and conversion advancements necessitate robust singing voice deepfake detection (SVDD) models. Current SVDD datasets face challenges due to limited controllability, diversity in deepfake methods, and licensing restrictions. Addressing these gaps, we introduce CtrSVDD, a large-scale, diverse collection of bonafide and deepfake singing vocals. These vocals are synthesized using state-of-the-art methods from publicly accessible singing voice datasets. CtrSVDD includes 47.64 hours of bonafide and 260.34 hours of deepfake singing vocals, spanning 14 deepfake methods and involving 164 singer identities. We also present a baseline system with flexible front-end features, evaluated against a structured train/dev/eval split. The experiments show the importance of feature selection and highlight a need for generalization towards deepfake methods that deviate further from training distribution. The CtrSVDD dataset and baselines are publicly accessible. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.02438v2-abstract-full').style.display = 'none'; document.getElementById('2406.02438v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. 
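<p class="is-size-7">The CtrSVDD baseline above is described as having flexible front-end features, and the experiments stress the importance of feature selection. As a rough illustration of what one such front end can look like, here is a small sketch that computes a log-mel spectrogram with librosa; the function name, parameter values, and file path are placeholder assumptions, not the released baseline code.</p> <pre><code class="language-python">
import numpy as np
import librosa  # assumed available; any STFT implementation would do

def log_mel_frontend(wav_path, sr=16000, n_mels=80, n_fft=1024, hop_length=160):
    """One possible front-end: a log-mel spectrogram. A flexible baseline can
    swap this for LFCCs, raw waveforms, or learned features."""
    y, _ = librosa.load(wav_path, sr=sr)
    mel = librosa.feature.melspectrogram(
        y=y, sr=sr, n_fft=n_fft, hop_length=hop_length, n_mels=n_mels)
    return librosa.power_to_db(mel, ref=np.max).T   # (frames, n_mels)

# Usage (the file name is a placeholder):
# feats = log_mel_frontend("vocal_clip.wav")
# print(feats.shape)
</code></pre>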
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by Interspeech 2024</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Proceedings of Interspeech 2024 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.11767">arXiv:2405.11767</a> <span> [<a href="https://arxiv.org/pdf/2405.11767">pdf</a>, <a href="https://arxiv.org/format/2405.11767">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Multi-speaker Text-to-speech Training with Speaker Anonymized Data </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Huang%2C+W">Wen-Chin Huang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Y">Yi-Chiao Wu</a>, <a href="/search/cs?searchtype=author&query=Toda%2C+T">Tomoki Toda</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.11767v1-abstract-short" style="display: inline;"> The trend of scaling up speech generation models poses a threat of biometric information leakage of the identities of the voices in the training data, raising privacy and security concerns. In this paper, we investigate training multi-speaker text-to-speech (TTS) models using data that underwent speaker anonymization (SA), a process that tends to hide the speaker identity of the input speech while… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.11767v1-abstract-full').style.display = 'inline'; document.getElementById('2405.11767v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.11767v1-abstract-full" style="display: none;"> The trend of scaling up speech generation models poses a threat of biometric information leakage of the identities of the voices in the training data, raising privacy and security concerns. In this paper, we investigate training multi-speaker text-to-speech (TTS) models using data that underwent speaker anonymization (SA), a process that tends to hide the speaker identity of the input speech while maintaining other attributes. Two signal processing-based and three deep neural network-based SA methods were used to anonymize VCTK, a multi-speaker TTS dataset, which is further used to train an end-to-end TTS model, VITS, to perform unseen speaker TTS during the testing phase. We conducted extensive objective and subjective experiments to evaluate the anonymized training data, as well as the performance of the downstream TTS model trained using those data. Importantly, we found that UTMOS, a data-driven subjective rating predictor model, and GVD, a metric that measures the gain of voice distinctiveness, are good indicators of the downstream TTS performance. 
We summarize insights in the hope of helping future researchers determine the goodness of the SA system for multi-speaker TTS training. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.11767v1-abstract-full').style.display = 'none'; document.getElementById('2405.11767v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages. Submitted to Signal Processing Letters. Audio sample page: https://unilight.github.io/Publication-Demos/publications/sa-tts-spl/index.html</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.05244">arXiv:2405.05244</a> <span> [<a href="https://arxiv.org/pdf/2405.05244">pdf</a>, <a href="https://arxiv.org/format/2405.05244">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> SVDD Challenge 2024: A Singing Voice Deepfake Detection Challenge Evaluation Plan </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">You Zhang</a>, <a href="/search/cs?searchtype=author&query=Zang%2C+Y">Yongyi Zang</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+J">Jiatong Shi</a>, <a href="/search/cs?searchtype=author&query=Yamamoto%2C+R">Ryuichi Yamamoto</a>, <a href="/search/cs?searchtype=author&query=Han%2C+J">Jionghao Han</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+Y">Yuxun Tang</a>, <a href="/search/cs?searchtype=author&query=Toda%2C+T">Tomoki Toda</a>, <a href="/search/cs?searchtype=author&query=Duan%2C+Z">Zhiyao Duan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.05244v1-abstract-short" style="display: inline;"> The rapid advancement of AI-generated singing voices, which now closely mimic natural human singing and align seamlessly with musical scores, has led to heightened concerns for artists and the music industry. 
Unlike spoken voice, singing voice presents unique challenges due to its musical nature and the presence of strong background music, making singing voice deepfake detection (SVDD) a specializ… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.05244v1-abstract-full').style.display = 'inline'; document.getElementById('2405.05244v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.05244v1-abstract-full" style="display: none;"> The rapid advancement of AI-generated singing voices, which now closely mimic natural human singing and align seamlessly with musical scores, has led to heightened concerns for artists and the music industry. Unlike spoken voice, singing voice presents unique challenges due to its musical nature and the presence of strong background music, making singing voice deepfake detection (SVDD) a specialized field requiring focused attention. To promote SVDD research, we recently proposed the "SVDD Challenge," the very first research challenge focusing on SVDD for lab-controlled and in-the-wild bonafide and deepfake singing voice recordings. The challenge will be held in conjunction with the 2024 IEEE Spoken Language Technology Workshop (SLT 2024). <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.05244v1-abstract-full').style.display = 'none'; document.getElementById('2405.05244v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Evaluation plan of the SVDD Challenge @ SLT 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.06682">arXiv:2404.06682</a> <span> [<a href="https://arxiv.org/pdf/2404.06682">pdf</a>, <a href="https://arxiv.org/format/2404.06682">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Learning Multidimensional Disentangled Representations of Instrumental Sounds for Musical Similarity Assessment </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hashizume%2C+Y">Yuka Hashizume</a>, <a href="/search/cs?searchtype=author&query=Li%2C+L">Li Li</a>, <a href="/search/cs?searchtype=author&query=Miyashita%2C+A">Atsushi Miyashita</a>, <a href="/search/cs?searchtype=author&query=Toda%2C+T">Tomoki Toda</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.06682v1-abstract-short" style="display: inline;"> To achieve a flexible recommendation and retrieval system, it is desirable to calculate music similarity by focusing on multiple partial elements of musical pieces and allowing the users to select the element they want to focus on. 
A previous study proposed using multiple individual networks for calculating music similarity based on each instrumental sound, but it is impractical to use each signal… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.06682v1-abstract-full').style.display = 'inline'; document.getElementById('2404.06682v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.06682v1-abstract-full" style="display: none;"> To achieve a flexible recommendation and retrieval system, it is desirable to calculate music similarity by focusing on multiple partial elements of musical pieces and allowing the users to select the element they want to focus on. A previous study proposed using multiple individual networks for calculating music similarity based on each instrumental sound, but it is impractical to use each signal as a query in search systems. Using separated instrumental sounds instead resulted in lower accuracy due to separation artifacts. In this paper, we propose a method to compute similarities focusing on each instrumental sound with a single network that takes mixed sounds as input instead of individual instrumental sounds. Specifically, we design a single similarity embedding space with disentangled dimensions for each instrument, extracted by Conditional Similarity Networks, which is trained by the triplet loss using masks. Experimental results have shown that (1) the proposed method obtains a more accurate feature representation than individual networks that use separated sounds as input, (2) each sub-embedding space can hold the characteristics of the corresponding instrument, and (3) the similar musical pieces selected by the proposed method when focusing on each instrumental sound agree with human judgement, especially for drums and guitar. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.06682v1-abstract-full').style.display = 'none'; document.getElementById('2404.06682v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024.
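<p class="is-size-7">The abstract above trains a single embedding space whose per-instrument sub-dimensions are selected by masks under a triplet loss. Below is a minimal PyTorch sketch of such a masked triplet objective; the embedding size, sub-space split, and margin are illustrative assumptions rather than the paper's configuration.</p> <pre><code class="language-python">
import torch
import torch.nn.functional as F

def masked_triplet_loss(anchor, positive, negative, mask, margin=0.2):
    """Triplet loss computed only over the sub-dimensions selected by `mask`,
    so each instrument condition trains its own slice of a shared embedding."""
    d_ap = ((anchor - positive) ** 2 * mask).sum(dim=1)
    d_an = ((anchor - negative) ** 2 * mask).sum(dim=1)
    return F.relu(d_ap - d_an + margin).mean()

# Toy usage: a 64-d embedding split into four 16-d sub-spaces, conditioning on
# the second sub-space (e.g. "drums"); all sizes are illustrative assumptions.
dim = 64
mask = torch.zeros(dim)
mask[16:32] = 1.0
anchor, positive, negative = (torch.randn(8, dim) for _ in range(3))
print(masked_triplet_loss(anchor, positive, negative, mask).item())
</code></pre>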
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.06100">arXiv:2403.06100</a> <span> [<a href="https://arxiv.org/pdf/2403.06100">pdf</a>, <a href="https://arxiv.org/format/2403.06100">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Automatic design optimization of preference-based subjective evaluation with online learning in crowdsourcing environment </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yasuda%2C+Y">Yusuke Yasuda</a>, <a href="/search/cs?searchtype=author&query=Toda%2C+T">Tomoki Toda</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.06100v1-abstract-short" style="display: inline;"> A preference-based subjective evaluation is a key method for evaluating generative media reliably. However, its huge combinations of pairs prohibit it from being applied to large-scale evaluation using crowdsourcing. To address this issue, we propose an automatic optimization method for preference-based subjective evaluation in terms of pair combination selections and allocation of evaluation volu… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.06100v1-abstract-full').style.display = 'inline'; document.getElementById('2403.06100v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.06100v1-abstract-full" style="display: none;"> A preference-based subjective evaluation is a key method for evaluating generative media reliably. However, its huge combinations of pairs prohibit it from being applied to large-scale evaluation using crowdsourcing. To address this issue, we propose an automatic optimization method for preference-based subjective evaluation in terms of pair combination selections and allocation of evaluation volumes with online learning in a crowdsourcing environment. We use a preference-based online learning method based on a sorting algorithm to identify the total order of evaluation targets with minimum sample volumes. Our online learning algorithm supports parallel and asynchronous execution under fixed-budget conditions required for crowdsourcing. Our experiment on preference-based subjective evaluation of synthetic speech shows that our method successfully optimizes the test by reducing pair combinations from 351 to 83 and allocating optimal evaluation volumes for each pair ranging from 30 to 663 without compromising evaluation accuracies and wasting budget allocations. 
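<p class="is-size-7">To make the idea of a sorting-based preference test concrete, the sketch below ranks 27 simulated systems (351 possible pairs) with a merge sort whose comparisons are decided by majority votes from simulated listeners. It is a simplified stand-in for the paper's online learning method, which additionally optimizes vote allocation under a fixed budget; the vote counts and noise level are assumptions.</p> <pre><code class="language-python">
import random

def noisy_vote(a, b, quality, n_votes=15, noise=1.0):
    """Stand-in for crowdsourced listeners: each vote prefers the item whose
    (noisy) quality is higher; the pair is decided by majority."""
    wins_a = sum((quality[a] + random.gauss(0, noise))
                 > (quality[b] + random.gauss(0, noise))
                 for _ in range(n_votes))
    return wins_a > n_votes / 2

def preference_merge_sort(items, prefer):
    """Merge sort driven by pairwise preferences: only O(n log n) pairs are
    ever evaluated instead of all n*(n-1)/2 combinations."""
    if len(items) <= 1:
        return items
    mid = len(items) // 2
    left = preference_merge_sort(items[:mid], prefer)
    right = preference_merge_sort(items[mid:], prefer)
    merged = []
    while left and right:
        merged.append(left.pop(0) if prefer(left[0], right[0]) else right.pop(0))
    return merged + left + right

random.seed(0)
systems = list(range(27))                 # 27 systems -> 351 possible pairs
quality = {s: random.uniform(1, 5) for s in systems}
ranking = preference_merge_sort(systems, lambda a, b: noisy_vote(a, b, quality))
print("estimated best-to-worst order:", ranking)
</code></pre>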
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.06100v1-abstract-full').style.display = 'none'; document.getElementById('2403.06100v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2401.13260">arXiv:2401.13260</a> <span> [<a href="https://arxiv.org/pdf/2401.13260">pdf</a>, <a href="https://arxiv.org/format/2401.13260">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> MF-AED-AEC: Speech Emotion Recognition by Leveraging Multimodal Fusion, Asr Error Detection, and Asr Error Correction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=He%2C+J">Jiajun He</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+X">Xiaohan Shi</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xingfeng Li</a>, <a href="/search/cs?searchtype=author&query=Toda%2C+T">Tomoki Toda</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2401.13260v2-abstract-short" style="display: inline;"> The prevalent approach in speech emotion recognition (SER) involves integrating both audio and textual information to comprehensively identify the speaker's emotion, with the text generally obtained through automatic speech recognition (ASR). An essential issue of this approach is that ASR errors from the text modality can worsen the performance of SER. Previous studies have proposed using an auxi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.13260v2-abstract-full').style.display = 'inline'; document.getElementById('2401.13260v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2401.13260v2-abstract-full" style="display: none;"> The prevalent approach in speech emotion recognition (SER) involves integrating both audio and textual information to comprehensively identify the speaker's emotion, with the text generally obtained through automatic speech recognition (ASR). An essential issue of this approach is that ASR errors from the text modality can worsen the performance of SER. Previous studies have proposed using an auxiliary ASR error detection task to adaptively assign weights of each word in ASR hypotheses. However, this approach has limited improvement potential because it does not address the coherence of semantic information in the text. Additionally, the inherent heterogeneity of different modalities leads to distribution gaps between their representations, making their fusion challenging. 
Therefore, in this paper, we incorporate two auxiliary tasks, ASR error detection (AED) and ASR error correction (AEC), to enhance the semantic coherence of ASR text, and further introduce a novel multi-modal fusion (MF) method to learn shared representations across modalities. We refer to our method as MF-AED-AEC. Experimental results indicate that MF-AED-AEC significantly outperforms the baseline model by a margin of 4.1%. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.13260v2-abstract-full').style.display = 'none'; document.getElementById('2401.13260v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 24 January, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ICASSP 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2311.07093">arXiv:2311.07093</a> <span> [<a href="https://arxiv.org/pdf/2311.07093">pdf</a>, <a href="https://arxiv.org/format/2311.07093">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> On the Effectiveness of ASR Representations in Real-world Noisy Speech Emotion Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shi%2C+X">Xiaohan Shi</a>, <a href="/search/cs?searchtype=author&query=He%2C+J">Jiajun He</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xingfeng Li</a>, <a href="/search/cs?searchtype=author&query=Toda%2C+T">Tomoki Toda</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2311.07093v3-abstract-short" style="display: inline;"> This paper proposes an efficient approach to noisy speech emotion recognition (NSER). Conventional NSER approaches have proven effective in mitigating the impact of artificial noise sources, such as white Gaussian noise, but are limited to non-stationary noises in real-world environments due to their complexity and uncertainty. To overcome this limitation, we introduce a new method for NSER by adop… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.07093v3-abstract-full').style.display = 'inline'; document.getElementById('2311.07093v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2311.07093v3-abstract-full" style="display: none;"> This paper proposes an efficient approach to noisy speech emotion recognition (NSER).
Conventional NSER approaches have proven effective in mitigating the impact of artificial noise sources, such as white Gaussian noise, but are limited to non-stationary noises in real-world environments due to their complexity and uncertainty. To overcome this limitation, we introduce a new method for NSER by adopting the automatic speech recognition (ASR) model as a noise-robust feature extractor to eliminate non-vocal information in noisy speech. We first obtain intermediate layer information from the ASR model as a feature representation for emotional speech and then apply this representation for the downstream NSER task. Our experimental results show that 1) the proposed method achieves better NSER performance compared with the conventional noise reduction method, 2) outperforms self-supervised learning approaches, and 3) even outperforms text-based approaches using ASR transcription or the ground truth transcription of noisy speech. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.07093v3-abstract-full').style.display = 'none'; document.getElementById('2311.07093v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 13 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Submitted to IEEE/ACM Transactions on Audio, Speech, and Language Processing</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2310.05203">arXiv:2310.05203</a> <span> [<a href="https://arxiv.org/pdf/2310.05203">pdf</a>, <a href="https://arxiv.org/format/2310.05203">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> A Comparative Study of Voice Conversion Models with Large-Scale Speech and Singing Data: The T13 Systems for the Singing Voice Conversion Challenge 2023 </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yamamoto%2C+R">Ryuichi Yamamoto</a>, <a href="/search/cs?searchtype=author&query=Yoneyama%2C+R">Reo Yoneyama</a>, <a href="/search/cs?searchtype=author&query=Violeta%2C+L+P">Lester Phillip Violeta</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+W">Wen-Chin Huang</a>, <a href="/search/cs?searchtype=author&query=Toda%2C+T">Tomoki Toda</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2310.05203v1-abstract-short" style="display: inline;"> This paper presents our systems (denoted as T13) for the singing voice 
conversion challenge (SVCC) 2023. For both in-domain and cross-domain English singing voice conversion (SVC) tasks (Task 1 and Task 2), we adopt a recognition-synthesis approach with self-supervised learning-based representation. To achieve data-efficient SVC with a limited amount of target singer/speaker's data (150 to 160 utt… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.05203v1-abstract-full').style.display = 'inline'; document.getElementById('2310.05203v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2310.05203v1-abstract-full" style="display: none;"> This paper presents our systems (denoted as T13) for the singing voice conversion challenge (SVCC) 2023. For both in-domain and cross-domain English singing voice conversion (SVC) tasks (Task 1 and Task 2), we adopt a recognition-synthesis approach with self-supervised learning-based representation. To achieve data-efficient SVC with a limited amount of target singer/speaker's data (150 to 160 utterances for SVCC 2023), we first train a diffusion-based any-to-any voice conversion model using publicly available large-scale 750 hours of speech and singing data. Then, we finetune the model for each target singer/speaker of Task 1 and Task 2. Large-scale listening tests conducted by SVCC 2023 show that our T13 system achieves competitive naturalness and speaker similarity for the harder cross-domain SVC (Task 2), which implies the generalization ability of our proposed method. Our objective evaluation results show that using large datasets is particularly beneficial for cross-domain SVC. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.05203v1-abstract-full').style.display = 'none'; document.getElementById('2310.05203v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2023. 
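<p class="is-size-7">The T13 systems above rely on self-supervised representations for recognition-synthesis conversion. As a rough illustration of the "recognition" side only, the sketch below extracts intermediate-layer HuBERT features with torchaudio; the chosen bundle, layer index, and dummy waveform are assumptions, and this is not the T13 implementation.</p> <pre><code class="language-python">
import torch
import torchaudio

# Pretrained HuBERT bundle from torchaudio (assumes a recent torchaudio with
# the HUBERT_BASE pipeline; weights are downloaded on first use).
bundle = torchaudio.pipelines.HUBERT_BASE
model = bundle.get_model().eval()

# Dummy one-second waveform at the bundle's sample rate; in practice this
# would be the source singer's or speaker's audio.
wav = torch.randn(1, int(bundle.sample_rate))

with torch.inference_mode():
    # extract_features returns the hidden states of every transformer layer;
    # recognition-synthesis systems often take an intermediate layer as a
    # (mostly) speaker-independent linguistic representation.
    features, _ = model.extract_features(wav)

content = features[6]   # e.g. the 7th of 12 layers; the layer is a hyperparameter
print(content.shape)    # (batch, frames, 768) for the base model
</code></pre>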
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to ASRU 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2310.05129">arXiv:2310.05129</a> <span> [<a href="https://arxiv.org/pdf/2310.05129">pdf</a>, <a href="https://arxiv.org/format/2310.05129">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> ed-cec: improving rare word recognition using asr postprocessing based on error detection and context-aware error correction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=He%2C+J">Jiajun He</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Zekun Yang</a>, <a href="/search/cs?searchtype=author&query=Toda%2C+T">Tomoki Toda</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2310.05129v1-abstract-short" style="display: inline;"> Automatic speech recognition (ASR) systems often encounter difficulties in accurately recognizing rare words, leading to errors that can have a negative impact on downstream tasks such as keyword spotting, intent detection, and text summarization. To address this challenge, we present a novel ASR postprocessing method that focuses on improving the recognition of rare words through error detection… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.05129v1-abstract-full').style.display = 'inline'; document.getElementById('2310.05129v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2310.05129v1-abstract-full" style="display: none;"> Automatic speech recognition (ASR) systems often encounter difficulties in accurately recognizing rare words, leading to errors that can have a negative impact on downstream tasks such as keyword spotting, intent detection, and text summarization. To address this challenge, we present a novel ASR postprocessing method that focuses on improving the recognition of rare words through error detection and context-aware error correction. Our method optimizes the decoding process by targeting only the predicted error positions, minimizing unnecessary computations. Moreover, we leverage a rare word list to provide additional contextual knowledge, enabling the model to better correct rare words. Experimental results across five datasets demonstrate that our proposed method achieves significantly lower word error rates (WERs) than previous approaches while maintaining a reasonable inference speed. Furthermore, our approach exhibits promising robustness across different ASR systems. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.05129v1-abstract-full').style.display = 'none'; document.getElementById('2310.05129v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2023. 
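<p class="is-size-7">To illustrate the idea of correcting only predicted error positions with help from a rare-word list, here is a deliberately simple toy sketch based on string matching; the actual system performs detection and correction with neural models, and the example tokens and word list are invented.</p> <pre><code class="language-python">
import difflib

def correct_flagged_tokens(hypothesis, error_flags, rare_words, cutoff=0.6):
    """Toy post-processing: only tokens flagged as likely errors are touched,
    and replacement candidates come from a rare-word list. The real system
    predicts the flags and corrections with neural models; this just shows
    why restricting decoding to flagged positions saves computation."""
    corrected = []
    for token, is_error in zip(hypothesis, error_flags):
        if is_error:
            match = difflib.get_close_matches(token, rare_words, n=1, cutoff=cutoff)
            corrected.append(match[0] if match else token)
        else:
            corrected.append(token)   # untouched positions cost nothing
    return corrected

# Invented example: the rare name "Toda" was misrecognized as "toda".
hyp = ["professor", "toda", "proposed", "the", "method"]
flags = [False, True, False, False, False]
rare_list = ["Toda", "CtrSVDD", "HuBERT"]
print(correct_flagged_tokens(hyp, flags, rare_list))
</code></pre>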
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">6 pages, 5 figures, conference</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2310.02570">arXiv:2310.02570</a> <span> [<a href="https://arxiv.org/pdf/2310.02570">pdf</a>, <a href="https://arxiv.org/format/2310.02570">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Improving severity preservation of healthy-to-pathological voice conversion with global style tokens </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Halpern%2C+B+M">Bence Mark Halpern</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+W">Wen-Chin Huang</a>, <a href="/search/cs?searchtype=author&query=Violeta%2C+L+P">Lester Phillip Violeta</a>, <a href="/search/cs?searchtype=author&query=van+Son%2C+R+J+J+H">R. J. J. H. van Son</a>, <a href="/search/cs?searchtype=author&query=Toda%2C+T">Tomoki Toda</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2310.02570v1-abstract-short" style="display: inline;"> In healthy-to-pathological voice conversion (H2P-VC), healthy speech is converted into pathological while preserving the identity. The paper improves on previous two-stage approach to H2P-VC where (1) speech is created first with the appropriate severity, (2) then the speaker identity of the voice is converted while preserving the severity of the voice. Specifically, we propose improvements to (2)… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.02570v1-abstract-full').style.display = 'inline'; document.getElementById('2310.02570v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2310.02570v1-abstract-full" style="display: none;"> In healthy-to-pathological voice conversion (H2P-VC), healthy speech is converted into pathological while preserving the identity. The paper improves on previous two-stage approach to H2P-VC where (1) speech is created first with the appropriate severity, (2) then the speaker identity of the voice is converted while preserving the severity of the voice. Specifically, we propose improvements to (2) by using phonetic posteriorgrams (PPG) and global style tokens (GST). Furthermore, we present a new dataset that contains parallel recordings of pathological and healthy speakers with the same identity which allows more precise evaluation. Listening tests by expert listeners show that the framework preserves severity of the source sample, while modelling target speaker's voice. We also show that (a) pathology impacts x-vectors but not all speaker information is lost, (b) choosing source speakers based on severity labels alone is insufficient. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.02570v1-abstract-full').style.display = 'none'; document.getElementById('2310.02570v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">7 pages, 3 figures, 5 tables. Accepted to IEEE Automatic Speech Recognition and Understanding Workshop 2023</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.7 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2309.09627">arXiv:2309.09627</a> <span> [<a href="https://arxiv.org/pdf/2309.09627">pdf</a>, <a href="https://arxiv.org/format/2309.09627">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Electrolaryngeal Speech Intelligibility Enhancement Through Robust Linguistic Encoders </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Violeta%2C+L+P">Lester Phillip Violeta</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+W">Wen-Chin Huang</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+D">Ding Ma</a>, <a href="/search/cs?searchtype=author&query=Yamamoto%2C+R">Ryuichi Yamamoto</a>, <a href="/search/cs?searchtype=author&query=Kobayashi%2C+K">Kazuhiro Kobayashi</a>, <a href="/search/cs?searchtype=author&query=Toda%2C+T">Tomoki Toda</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2309.09627v2-abstract-short" style="display: inline;"> We propose a novel framework for electrolaryngeal speech intelligibility enhancement through the use of robust linguistic encoders. Pretraining and fine-tuning approaches have proven to work well in this task, but in most cases, various mismatches, such as the speech type mismatch (electrolaryngeal vs. typical) or a speaker mismatch between the datasets used in each stage, can deteriorate the conv… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.09627v2-abstract-full').style.display = 'inline'; document.getElementById('2309.09627v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2309.09627v2-abstract-full" style="display: none;"> We propose a novel framework for electrolaryngeal speech intelligibility enhancement through the use of robust linguistic encoders. Pretraining and fine-tuning approaches have proven to work well in this task, but in most cases, various mismatches, such as the speech type mismatch (electrolaryngeal vs. typical) or a speaker mismatch between the datasets used in each stage, can deteriorate the conversion performance of this framework. 
To resolve this issue, we propose a linguistic encoder robust enough to project both EL and typical speech in the same latent space, while still being able to extract accurate linguistic information, creating a unified representation to reduce the speech type mismatch. Furthermore, we introduce HuBERT output features to the proposed framework for reducing the speaker mismatch, making it possible to effectively use a large-scale parallel dataset during pretraining. We show that compared to the conventional framework using mel-spectrogram input and output features, using the proposed framework enables the model to synthesize more intelligible and naturally sounding speech, as shown by a significant 16% improvement in character error rate and 0.83 improvement in naturalness score. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.09627v2-abstract-full').style.display = 'none'; document.getElementById('2309.09627v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 January, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 September, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to ICASSP 2024. Demo page: lesterphillip.github.io/icassp2024_el_sie</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2309.08141">arXiv:2309.08141</a> <span> [<a href="https://arxiv.org/pdf/2309.08141">pdf</a>, <a href="https://arxiv.org/format/2309.08141">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Audio Difference Learning for Audio Captioning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Komatsu%2C+T">Tatsuya Komatsu</a>, <a href="/search/cs?searchtype=author&query=Fujita%2C+Y">Yusuke Fujita</a>, <a href="/search/cs?searchtype=author&query=Takeda%2C+K">Kazuya Takeda</a>, <a href="/search/cs?searchtype=author&query=Toda%2C+T">Tomoki Toda</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2309.08141v1-abstract-short" style="display: inline;"> This study introduces a novel training paradigm, audio difference learning, for improving audio captioning. The fundamental concept of the proposed learning method is to create a feature representation space that preserves the relationship between audio, enabling the generation of captions that detail intricate audio information. 
This method employs a reference audio along with the input audio, bo… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.08141v1-abstract-full').style.display = 'inline'; document.getElementById('2309.08141v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2309.08141v1-abstract-full" style="display: none;"> This study introduces a novel training paradigm, audio difference learning, for improving audio captioning. The fundamental concept of the proposed learning method is to create a feature representation space that preserves the relationship between audio, enabling the generation of captions that detail intricate audio information. This method employs a reference audio along with the input audio, both of which are transformed into feature representations via a shared encoder. Captions are then generated from these differential features to describe their differences. Furthermore, a unique technique is proposed that involves mixing the input audio with additional audio, and using the additional audio as a reference. This results in the difference between the mixed audio and the reference audio reverting back to the original input audio. This allows the original input's caption to be used as the caption for their difference, eliminating the need for additional annotations for the differences. In the experiments using the Clotho and ESC50 datasets, the proposed method demonstrated an improvement in the SPIDEr score by 7% compared to conventional methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.08141v1-abstract-full').style.display = 'none'; document.getElementById('2309.08141v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 September, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2023. 
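<p class="is-size-7">The mixing technique above can be sketched in a few lines: mix the input clip with an extra clip, treat the extra clip as the reference, and the difference to be captioned reverts to the original input, whose caption already exists. The snippet shows the pair construction at the waveform level for clarity, whereas the paper forms differences between shared-encoder features; the mixing weight is an assumption.</p> <pre><code class="language-python">
import numpy as np

def make_difference_pair(input_audio, extra_audio, alpha=0.5):
    """Build a (mixture, reference) pair: the mixture of the input and an
    extra clip is the model input, the extra clip alone is the reference, and
    their difference corresponds to the original input, so its existing
    caption can supervise the "difference caption" with no new annotation.
    (Waveform-level mixing for clarity; the paper takes differences between
    shared-encoder features, and the mixing weight here is an assumption.)"""
    n = min(len(input_audio), len(extra_audio))
    mixture = alpha * input_audio[:n] + (1 - alpha) * extra_audio[:n]
    return mixture, extra_audio[:n]

# Toy usage with random one-second clips at 16 kHz.
rng = np.random.default_rng(0)
clip, extra = rng.standard_normal(16000), rng.standard_normal(16000)
mix, ref = make_difference_pair(clip, extra)
print(mix.shape, ref.shape)
</code></pre>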
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">submitted to ICASSP2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2309.07598">arXiv:2309.07598</a> <span> [<a href="https://arxiv.org/pdf/2309.07598">pdf</a>, <a href="https://arxiv.org/format/2309.07598">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> AAS-VC: On the Generalization Ability of Automatic Alignment Search based Non-autoregressive Sequence-to-sequence Voice Conversion </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Huang%2C+W">Wen-Chin Huang</a>, <a href="/search/cs?searchtype=author&query=Kobayashi%2C+K">Kazuhiro Kobayashi</a>, <a href="/search/cs?searchtype=author&query=Toda%2C+T">Tomoki Toda</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2309.07598v2-abstract-short" style="display: inline;"> Non-autoregressive (non-AR) sequence-to-sequence (seq2seq) models for voice conversion (VC) are attractive for their ability to effectively model the temporal structure while enjoying boosted intelligibility and fast inference thanks to non-AR modeling. However, the dependency of current non-AR seq2seq VC models on ground truth durations extracted from an external AR model greatly limits their generaliz… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.07598v2-abstract-full').style.display = 'inline'; document.getElementById('2309.07598v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2309.07598v2-abstract-full" style="display: none;"> Non-autoregressive (non-AR) sequence-to-sequence (seq2seq) models for voice conversion (VC) are attractive for their ability to effectively model the temporal structure while enjoying boosted intelligibility and fast inference thanks to non-AR modeling. However, the dependency of current non-AR seq2seq VC models on ground truth durations extracted from an external AR model greatly limits their generalization ability to smaller training datasets. In this paper, we first demonstrate the above-mentioned problem by varying the training data size. Then, we present AAS-VC, a non-AR seq2seq VC model based on automatic alignment search (AAS), which removes the dependency on external durations and serves as a proper inductive bias to provide the required generalization ability for small datasets. Experimental results show that AAS-VC can generalize better to a training dataset of only 5 minutes. We also conducted ablation studies to justify several model design choices. The audio samples and implementation are available online.
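<p class="is-size-7">AAS-VC replaces externally supplied durations with an automatic alignment search. The sketch below implements a generic monotonic alignment search by dynamic programming over a toy similarity matrix, in the spirit of such alignment methods; it is not the AAS-VC implementation, and the exact formulation in the paper may differ.</p> <pre><code class="language-python">
import numpy as np

def monotonic_alignment_search(log_p):
    """Best monotonic alignment between source steps (rows) and target frames
    (columns): each frame stays on the same source step or advances by one,
    maximising the summed log-likelihood, found by dynamic programming."""
    n_src, n_frm = log_p.shape
    q = np.full((n_src, n_frm), -np.inf)
    q[0, 0] = log_p[0, 0]
    for t in range(1, n_frm):
        for j in range(min(t + 1, n_src)):
            stay = q[j, t - 1]
            advance = q[j - 1, t - 1] if j > 0 else -np.inf
            q[j, t] = log_p[j, t] + max(stay, advance)
    # Backtrack from the last source step at the last frame.
    path = np.zeros_like(log_p, dtype=int)
    j = n_src - 1
    for t in range(n_frm - 1, -1, -1):
        path[j, t] = 1
        if t > 0 and j > 0 and q[j - 1, t - 1] >= q[j, t - 1]:
            j -= 1
    return path

# Toy frame-level similarity matrix: 4 source steps, 10 target frames.
rng = np.random.default_rng(0)
print(monotonic_alignment_search(rng.standard_normal((4, 10))))
</code></pre>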
arXiv:2309.02133 [pdf, other] (cs.SD; cs.CL; eess.AS)
Evaluating Methods for Ground-Truth-Free Foreign Accent Conversion
Authors: Wen-Chin Huang, Tomoki Toda
Abstract: Foreign accent conversion (FAC) is a special application of voice conversion (VC) which aims to convert the accented speech of a non-native speaker to native-sounding speech with the same speaker identity. FAC is difficult since the native speech from the desired non-native speaker to be used as the training target is impossible to collect. In this work, we evaluate three recently proposed methods for ground-truth-free FAC, all of which aim to harness the power of sequence-to-sequence (seq2seq) and non-parallel VC models to properly convert the accent and control the speaker identity.
Our experimental evaluation results show that no single method was significantly better than the others in all evaluation axes, which is in contrast to conclusions drawn in previous studies. We also explain the effectiveness of these methods with the training input and output of the seq2seq model, examine the design choice of the non-parallel VC model, and show that intelligibility measures such as word error rates do not correlate well with subjective accentedness. Finally, our implementation is open-sourced to promote reproducible research and help future researchers improve upon the compared systems.
Submitted 5 September, 2023; originally announced September 2023.
Comments: Accepted to the 2023 Asia Pacific Signal and Information Processing Association Annual Summit and Conference (APSIPA ASC). Demo page: https://unilight.github.io/Publication-Demos/publications/fac-evaluate. Code: https://github.com/unilight/seq2seq-vc

arXiv:2306.14422 [pdf, other] (cs.SD; cs.CL; eess.AS)
The Singing Voice Conversion Challenge 2023
Authors: Wen-Chin Huang, Lester Phillip Violeta, Songxiang Liu, Jiatong Shi, Tomoki Toda
Abstract: We present the latest iteration of the voice conversion challenge (VCC) series, a bi-annual scientific event aiming to compare and understand different voice conversion (VC) systems based on a common dataset. This year we shifted our focus to singing voice conversion (SVC), and thus named the challenge the Singing Voice Conversion Challenge (SVCC).
A new database was constructed for two tasks, namely in-domain and cross-domain SVC. The challenge was run for two months, and in total we received 26 submissions, including 2 baselines. Through a large-scale crowd-sourced listening test, we observed that for both tasks, although human-level naturalness was achieved by the top system, no team was able to obtain a similarity score as high as the target speakers. Also, as expected, cross-domain SVC is harder than in-domain SVC, especially in the similarity aspect. We also investigated whether existing objective measurements were able to predict perceptual performance, and found that only a few of them reached a significant correlation.
Submitted 6 July, 2023; v1 submitted 26 June, 2023; originally announced June 2023.

arXiv:2306.13953 [pdf, other] (cs.SD; eess.AS)
An Analysis of Personalized Speech Recognition System Development for the Deaf and Hard-of-Hearing
Authors: Lester Phillip Violeta, Tomoki Toda
Abstract: Deaf or hard-of-hearing (DHH) speakers typically have atypical speech caused by deafness. With the growing support of speech-based devices and software applications, more work needs to be done to make these devices inclusive to everyone. To do so, we analyze the use of openly available automatic speech recognition (ASR) tools with a DHH Japanese speaker dataset.
As these out-of-the-box ASR models typically do not perform well on DHH speech, we provide a thorough analysis of creating personalized ASR systems. We collected a large DHH speaker dataset of four speakers totaling around 28.05 hours and thoroughly analyzed the performance of different training frameworks by varying the training data sizes. Our findings show that 1000 utterances (or 1-2 hours) from a target speaker can already significantly improve the model performance with a minimal amount of work needed, thus we recommend that researchers collect at least 1000 utterances to build an efficient personalized ASR system. In cases where 1000 utterances are difficult to collect, we also discover significant improvements in using previously proposed data augmentation techniques such as intermediate fine-tuning when only 200 utterances are available.
Submitted 24 June, 2023; originally announced June 2023.
Comments: Submitted to APSIPA 2023

arXiv:2212.08329 [pdf, other] (eess.AS; cs.CL; stat.ML)
Text-to-speech synthesis based on latent variable conversion using diffusion probabilistic model and variational autoencoder
Authors: Yusuke Yasuda, Tomoki Toda
Abstract: Text-to-speech synthesis (TTS) is a task to convert texts into speech. Two of the factors that have been driving TTS are the advancements of probabilistic models and latent representation learning. We propose a TTS method based on latent variable conversion using a diffusion probabilistic model and the variational autoencoder (VAE). In our TTS method, we use a waveform model based on VAE, a diffusion model that predicts the distribution of latent variables in the waveform model from texts, and an alignment model that learns alignments between the text and speech latent sequences. Our method integrates diffusion with VAE by modeling both mean and variance parameters with diffusion, where the target distribution is determined by approximation from VAE. This latent variable conversion framework potentially enables us to flexibly incorporate various latent feature extractors. Our experiments show that our method is robust to linguistic labels with poor orthography and alignment errors.
Submitted 16 December, 2022; originally announced December 2022.
Comments: Submitted to ICASSP 2023

arXiv:2212.08321 [pdf, other] (eess.AS; cs.CL)
DOI: 10.1109/JSTSP.2022.3190672
Investigation of Japanese PnG BERT language model in text-to-speech synthesis for pitch accent language
Authors: Yusuke Yasuda, Tomoki Toda
Abstract: End-to-end text-to-speech synthesis (TTS) can generate highly natural synthetic speech from raw text. However, rendering the correct pitch accents is still a challenging problem for end-to-end TTS. To tackle the challenge of rendering correct pitch accents in Japanese end-to-end TTS, we adopt PnG BERT, a self-supervised pretrained model in the character and phoneme domain, for TTS. We investigate the effects of features captured by PnG BERT on Japanese TTS by modifying the fine-tuning condition to determine the conditions helpful for inferring pitch accents.
We manipulate the content of PnG BERT features from being text-oriented to speech-oriented by changing the number of fine-tuned layers during TTS. In addition, we teach PnG BERT pitch accent information by fine-tuning with tone prediction as an additional downstream task. Our experimental results show that the features of PnG BERT captured by pretraining contain information helpful for inferring pitch accent, and that PnG BERT outperforms a baseline Tacotron on accent correctness in a listening test.
Submitted 16 December, 2022; originally announced December 2022.
Journal ref: IEEE Journal of Selected Topics in Signal Processing, Volume 16, Issue 6, October 2022

arXiv:2211.07863 [pdf] (cs.SD; eess.AS)
Music Similarity Calculation of Individual Instrumental Sounds Using Metric Learning
Authors: Yuka Hashizume, Li Li, Tomoki Toda
Abstract: The criteria for measuring music similarity are important for developing a flexible music recommendation system. Some data-driven methods have been proposed to calculate music similarity from only music signals, such as metric learning based on a triplet loss using tag information on each musical piece.
However, the resulting music similarity metric usually captures the entire piece of music, i.e., the mixture of various instrumental sound sources, limiting the capability of the music recommendation system; e.g., it is difficult to search for a musical piece containing similar drum sounds. Towards the development of a more flexible music recommendation system, we propose a music similarity calculation method that focuses on individual instrumental sound sources in a musical piece. To fully exploit the potential of data-driven methods, we employ weakly supervised metric learning on individual instrumental sound source signals without using any tag information, where positive and negative samples in a triplet loss are defined by whether or not they are from the same musical piece. Furthermore, assuming that each instrumental sound source is not always available in practice, we also investigate the effects of using instrumental sound source separation to obtain each source in the proposed method. Experimental results have shown that (1) unique similarity metrics can be learned for individual instrumental sound sources, (2) similarity metrics learned using some instrumental sound sources can lead to more accurate results than those learned using the entire musical piece, (3) the performance degraded when learning with the separated instrumental sounds, and (4) similarity metrics learned by the proposed method produced results that correspond well to human perception.
Submitted 14 November, 2022; originally announced November 2022.
Comments: APSIPA ASC 2022 (pp. 33-38)
MSC Class: 68T99
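The weakly supervised triplet setup described above (positives from the same musical piece as the anchor, negatives from a different piece) can be sketched in a few lines. The encoder, feature dimensions, and the stand-in feature tensors below are placeholders, not the paper's configuration.

    # Minimal sketch of the same-piece triplet-loss setup described above.
    import torch
    import torch.nn as nn

    encoder = nn.Sequential(nn.Linear(128, 256), nn.ReLU(), nn.Linear(256, 64))
    triplet = nn.TripletMarginLoss(margin=0.2)

    def embed(x):
        # unit-norm embeddings so distances are comparable across pieces
        return nn.functional.normalize(encoder(x), dim=-1)

    anchor_a   = torch.randn(8, 128)   # e.g. drum-stem features, piece A, excerpt 1
    positive_a = torch.randn(8, 128)   # drum-stem features, piece A, excerpt 2
    negative_b = torch.randn(8, 128)   # drum-stem features, piece B

    loss = triplet(embed(anchor_a), embed(positive_a), embed(negative_b))
    loss.backward()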
arXiv:2211.01198 [pdf, other] (eess.AS; cs.SD)
Analysis of Noisy-target Training for DNN-based speech enhancement
Authors: Takuya Fujimura, Tomoki Toda
Abstract: Deep neural network (DNN)-based speech enhancement usually uses clean speech as a training target. However, it is hard to collect large amounts of clean speech because recording it is very costly. In other words, the performance of current speech enhancement has been limited by the amount of training data. To relax this limitation, Noisy-target Training (NyTT), which utilizes noisy speech as a training target, has been proposed. Although it has been experimentally shown that NyTT can train a DNN without clean speech, a detailed analysis has not been conducted and its behavior has not been well understood. In this paper, we conduct various analyses to deepen our understanding of NyTT. In addition, based on the property of NyTT, we propose a refined method that is comparable to the method using clean speech. Furthermore, we show that we can improve the performance by using a huge amount of noisy speech together with clean speech.
Submitted 2 November, 2022; originally announced November 2022.
Comments: Submitted to ICASSP 2023
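For readers unfamiliar with the recipe, the toy sketch below follows the commonly described NyTT construction: the training target is itself noisy speech, and the network input is that same noisy speech further corrupted with extra noise. Treat this input construction as an assumption about the general NyTT recipe rather than a statement of this paper's exact setup; the tiny model, shapes, and noise level are placeholders.

    # Toy sketch of the noisy-target training recipe referenced above.
    import torch
    import torch.nn as nn

    model = nn.Sequential(nn.Linear(256, 512), nn.ReLU(), nn.Linear(512, 256))
    l1 = nn.L1Loss()

    noisy_target = torch.randn(4, 256)           # noisy speech used as the target
    extra_noise = 0.3 * torch.randn(4, 256)      # additional noise drawn at training time
    noisier_input = noisy_target + extra_noise   # network input

    loss = l1(model(noisier_input), noisy_target)
    loss.backward()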
arXiv:2211.01079 [pdf, other] (cs.SD; eess.AS)
Intermediate Fine-Tuning Using Imperfect Synthetic Speech for Improving Electrolaryngeal Speech Recognition
Authors: Lester Phillip Violeta, Ding Ma, Wen-Chin Huang, Tomoki Toda
Abstract: Research on automatic speech recognition (ASR) systems for electrolaryngeal speakers has been relatively unexplored due to small datasets. When training data is lacking in ASR, a large-scale pretraining and fine-tuning framework is often sufficient to achieve high recognition rates; however, in electrolaryngeal speech, the domain shift between the pretraining and fine-tuning data is too large to overcome, limiting the maximum improvement of recognition rates. To resolve this, we propose an intermediate fine-tuning step that uses imperfect synthetic speech to close the domain-shift gap between the pretraining and target data. Despite the imperfect synthetic data, we show the effectiveness of this approach on electrolaryngeal speech datasets, with improvements of 6.1% over the baseline that did not use imperfect synthetic speech. Results show how the intermediate fine-tuning stage focuses on learning the high-level inherent features of the imperfect synthetic data rather than low-level features such as intelligibility.
Submitted 30 May, 2023; v1 submitted 2 November, 2022; originally announced November 2022.
Comments: Accepted to ICASSP 2023

arXiv:2210.15987 [pdf, other] (eess.AS; cs.LG; cs.SD; eess.SP)
NNSVS: A Neural Network-Based Singing Voice Synthesis Toolkit
Authors: Ryuichi Yamamoto, Reo Yoneyama, Tomoki Toda
Abstract: This paper describes the design of NNSVS, open-source software for neural network-based singing voice synthesis research. NNSVS is inspired by Sinsy, an open-source pioneer in singing voice synthesis research, and provides many additional features such as multi-stream models, autoregressive fundamental frequency models, and neural vocoders. Furthermore, NNSVS provides extensive documentation and numerous scripts to build complete singing voice synthesis systems. Experimental results demonstrate that our best system significantly outperforms our reproduction of Sinsy and other baseline systems. The toolkit is available at https://github.com/nnsvs/nnsvs.
Submitted 1 March, 2023; v1 submitted 28 October, 2022; originally announced October 2022.
Comments: Accepted to ICASSP 2023

arXiv:2210.15533 [pdf, other] (cs.SD; cs.LG; eess.AS)
Source-Filter HiFi-GAN: Fast and Pitch Controllable High-Fidelity Neural Vocoder
Authors: Reo Yoneyama, Yi-Chiao Wu, Tomoki Toda
Abstract: Our previous work, the unified source-filter GAN (uSFGAN) vocoder, introduced a novel architecture based on the source-filter theory into the parallel waveform generative adversarial network to achieve high voice quality and pitch controllability. However, the high temporal resolution inputs result in high computation costs. Although the HiFi-GAN vocoder achieves fast high-fidelity voice generation thanks to its efficient upsampling-based generator architecture, its pitch controllability is severely limited. To realize a fast and pitch-controllable high-fidelity neural vocoder, we introduce the source-filter theory into HiFi-GAN by hierarchically conditioning the resonance filtering network on well-estimated source excitation information.
According to the experimental results, our proposed method outperforms HiFi-GAN and uSFGAN in singing voice generation in terms of voice quality and synthesis speed on a single CPU. Furthermore, unlike the uSFGAN vocoder, the proposed method can be easily adopted or integrated into real-time applications and end-to-end systems.
Submitted 27 February, 2023; v1 submitted 27 October, 2022; originally announced October 2022.
Comments: Accepted to ICASSP 2023

arXiv:2210.10314 [pdf, other] (cs.SD; eess.AS)
Two-stage training method for Japanese electrolaryngeal speech enhancement based on sequence-to-sequence voice conversion
Authors: Ding Ma, Lester Phillip Violeta, Kazuhiro Kobayashi, Tomoki Toda
Abstract: Sequence-to-sequence (seq2seq) voice conversion (VC) models have greater potential in converting electrolaryngeal (EL) speech to normal speech (EL2SP) compared to conventional VC models. However, EL2SP based on seq2seq VC requires a sufficiently large amount of parallel data for model training, and it suffers from significant performance degradation when the amount of training data is insufficient.
To address this issue, we propose a novel two-stage strategy to optimize EL2SP based on seq2seq VC when only a small parallel dataset is available. In contrast to previous studies that utilize high-quality data augmentation, we first combine a large amount of imperfect synthetic parallel data of EL and normal speech with the original dataset for VC training. Then, a second training stage is conducted with the original parallel dataset only. The results show that the proposed method progressively improves the performance of EL2SP based on seq2seq VC.
Submitted 19 October, 2022; originally announced October 2022.
Comments: Accepted to SLT 2022
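The two-stage schedule above reduces to a simple training loop: one pass over the pooled synthetic-plus-original data, then a pass over the original data only. The model, optimizer, and random tensors below are stand-ins used purely to show the schedule, not the paper's seq2seq VC system.

    # Sketch of the two-stage training schedule described above.
    import torch
    import torch.nn as nn

    model = nn.Linear(80, 80)                        # stand-in for a seq2seq VC model
    opt = torch.optim.Adam(model.parameters(), 1e-3)
    mse = nn.MSELoss()

    synthetic = [(torch.randn(80), torch.randn(80)) for _ in range(100)]  # imperfect pairs
    original = [(torch.randn(80), torch.randn(80)) for _ in range(10)]    # real parallel pairs

    for stage, data in [("stage 1", synthetic + original), ("stage 2", original)]:
        for src, tgt in data:                        # one pass per stage for brevity
            opt.zero_grad()
            mse(model(src), tgt).backward()
            opt.step()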
arXiv:2207.13959 [pdf, other] (cs.DS; cs.DM)
ZDD-Based Algorithmic Framework for Solving Shortest Reconfiguration Problems
Authors: Takehiro Ito, Jun Kawahara, Yu Nakahata, Takehide Soh, Akira Suzuki, Junichi Teruyama, Takahisa Toda
Abstract: This paper proposes an algorithmic framework for various reconfiguration problems using zero-suppressed binary decision diagrams (ZDDs), a data structure for families of sets. In general, a reconfiguration problem asks whether there is a step-by-step transformation between two given feasible solutions (e.g., independent sets of an input graph) of a fixed search problem such that all intermediate results are also feasible and each step obeys a fixed reconfiguration rule (e.g., adding/removing a single vertex to/from an independent set). The solution space formed by all feasible solutions can be exponential in the input size, and indeed many reconfiguration problems are known to be PSPACE-complete. This paper shows that an algorithm in the proposed framework efficiently conducts breadth-first search by compressing the solution space using ZDDs, and finds a shortest transformation between two given feasible solutions if one exists. Moreover, the proposed framework provides rich information on the solution space, such as its connectivity and all feasible solutions reachable from a specified one. We demonstrate that the proposed framework can be applied to various reconfiguration problems, and experimentally evaluate their performance.
Submitted 16 December, 2022; v1 submitted 28 July, 2022; originally announced July 2022.
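To make the problem setting concrete, the sketch below runs the same shortest-transformation search by breadth-first search over an explicitly enumerated solution space (independent sets of a 4-cycle under a single add/remove step). The paper's contribution is to compress this search with ZDDs; explicit enumeration as done here only works for toy instances, and the graph and rule are illustrative choices.

    # Explicit-enumeration sketch of shortest reconfiguration between independent sets.
    from itertools import combinations
    from collections import deque

    edges = {(0, 1), (1, 2), (2, 3), (3, 0)}         # a 4-cycle
    n = 4

    def independent(s):
        return all((a, b) not in edges and (b, a) not in edges
                   for a in s for b in s if a < b)

    solutions = {frozenset(c) for k in range(n + 1)
                 for c in combinations(range(n), k) if independent(c)}

    def shortest_reconfiguration(start, goal):
        queue, seen = deque([(start, [start])]), {start}
        while queue:
            cur, path = queue.popleft()
            if cur == goal:
                return path
            for v in range(n):                       # add or remove a single vertex
                nxt = cur ^ {v}
                if nxt in solutions and nxt not in seen:
                    seen.add(nxt)
                    queue.append((nxt, path + [nxt]))
        return None                                  # goal not reachable

    print(shortest_reconfiguration(frozenset({0, 2}), frozenset({1, 3})))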
arXiv:2207.05913 [pdf, other] (eess.AS; cs.SD)
DOI: 10.1561/116.00000020
A Cyclical Approach to Synthetic and Natural Speech Mismatch Refinement of Neural Post-filter for Low-cost Text-to-speech System
Authors: Yi-Chiao Wu, Patrick Lumban Tobing, Kazuki Yasuhara, Noriyuki Matsunaga, Yamato Ohtani, Tomoki Toda
Abstract: Neural-based text-to-speech (TTS) systems achieve very high-fidelity speech generation because of the rapid neural network developments. However, the huge labeled corpus and high computation cost requirements limit the possibility of developing a high-fidelity TTS system by small companies or individuals. On the other hand, a neural vocoder, which has been widely adopted for the speech generation in neural-based TTS systems, can be trained with a relatively small unlabeled corpus. Therefore, in this paper, we explore a general framework to develop a neural post-filter (NPF) for low-cost TTS systems using neural vocoders. A cyclical approach is proposed to tackle the acoustic and temporal mismatches (AM and TM) of developing an NPF. Both objective and subjective evaluations have been conducted to demonstrate the AM and TM problems and the effectiveness of the proposed framework.
Submitted 12 July, 2022; originally announced July 2022.
Comments: 15 pages, 7 figures, 10 tables
Journal ref: APSIPA Transactions on Signal and Information Processing, Vol. 11, Issue 1, 2022

arXiv:2207.04356 [pdf, other] (cs.SD; cs.LG; eess.AS)
DOI: 10.1109/JSTSP.2022.3193761
A Comparative Study of Self-supervised Speech Representation Based Voice Conversion
Authors: Wen-Chin Huang, Shu-Wen Yang, Tomoki Hayashi, Tomoki Toda
Abstract: We present a large-scale comparative study of self-supervised speech representation (S3R)-based voice conversion (VC).
In the context of recognition-synthesis VC, S3Rs are attractive owing to their potential to replace expensive supervised representations such as phonetic posteriorgrams (PPGs), which are commonly adopted by state-of-the-art VC systems. Using S3PRL-VC, an open-source VC software we previously developed, we provide a series of in-depth objective and subjective analyses under three VC settings: intra-/cross-lingual any-to-one (A2O) and any-to-any (A2A) VC, using the voice conversion challenge 2020 (VCC2020) dataset. We investigated S3R-based VC in various aspects, including model type, multilinguality, and supervision. We also studied the effect of a post-discretization process with k-means clustering and showed how it improves performance in the A2A setting. Finally, the comparison with state-of-the-art VC systems demonstrates the competitiveness of S3R-based VC and also sheds light on possible directions for improvement.
Submitted 9 July, 2022; originally announced July 2022.
Comments: Accepted to IEEE Journal of Selected Topics in Signal Processing. arXiv admin note: substantial text overlap with arXiv:2110.06280
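The post-discretization step mentioned above (replacing continuous self-supervised features with the ids of their nearest k-means centroids) can be sketched with scikit-learn. Real S3R features would come from a pretrained model; the random vectors, the 768-dimensional feature size, and the 100-cluster setting below are arbitrary placeholders.

    # Sketch of k-means discretization of self-supervised speech features.
    import numpy as np
    from sklearn.cluster import KMeans

    train_feats = np.random.randn(5000, 768)         # stand-in for pooled S3R frames
    kmeans = KMeans(n_clusters=100, n_init=10, random_state=0).fit(train_feats)

    utterance = np.random.randn(200, 768)             # frames of one utterance
    discrete_units = kmeans.predict(utterance)         # (200,) cluster ids per frame
    print(discrete_units[:10])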
arXiv:2206.15155 [pdf, other] (cs.SD; eess.AS)
An Evaluation of Three-Stage Voice Conversion Framework for Noisy and Reverberant Conditions
Authors: Yeonjong Choi, Chao Xie, Tomoki Toda
Abstract: This paper presents a new voice conversion (VC) framework capable of dealing with both additive noise and reverberation, and its performance evaluation. Some VC studies have focused on real-world circumstances where speech data are degraded by background noise and reverberation. To deal with more practical conditions where no clean target dataset is available, one possible approach is zero-shot VC, but its performance tends to degrade compared with VC using a sufficient amount of target speech data. To leverage a large amount of noisy-reverberant target speech data, we propose a three-stage VC framework based on a denoising process using a pretrained denoising model, a dereverberation process using a dereverberation model, and a VC process using a nonparallel VC model based on a variational autoencoder. The experimental results show that 1) noise and reverberation additively cause significant VC performance degradation, 2) the proposed method alleviates the adverse effects caused by both noise and reverberation, and significantly outperforms the baseline directly trained on the noisy-reverberant speech data, and 3) the potential degradation introduced by the denoising and dereverberation still causes noticeable adverse effects on VC performance.
Submitted 30 June, 2022; originally announced June 2022.
Comments: Accepted to INTERSPEECH 2022
arXiv:2206.05929 (https://arxiv.org/abs/2206.05929) [cs.SD, eess.AS]
Improvement of Serial Approach to Anomalous Sound Detection by Incorporating Two Binary Cross-Entropies for Outlier Exposure
Authors: Ibuki Kuroyanagi, Tomoki Hayashi, Kazuya Takeda, Tomoki Toda
Abstract: Anomalous sound detection systems must detect unknown, atypical sounds using only normal audio data. Conventional methods use the serial method, a combination of outlier exposure (OE), which classifies normal and pseudo-anomalous data and obtains an embedding, and inlier modeling (IM), which models the probability distribution of the embedding. Although the serial method shows high performance due to the powerful feature extraction of OE and the robustness of IM, OE still does not work well when the normal and pseudo-anomalous data are too similar or too different. To explicitly distinguish these data, the proposed method uses multi-task learning of two binary cross-entropies when training OE. The first is a loss that classifies which product of the target machine type a sound is emitted from, which deals with the case where the normal data and the pseudo-anomalous data are too similar. The second is a loss that identifies whether the sound is emitted from the target machine or not, which deals with the case where the normal data and the pseudo-anomalous data are too different. We perform our experiments with the DCASE 2021 Task 2 dataset. Our proposed single-model method outperforms the top-ranked method, which combines multiple models, by 2.1% in AUC.
Submitted 13 June, 2022; originally announced June 2022.
Comments: 5 pages, 3 figures, 3 tables, EUSIPCO 2022

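A minimal PyTorch sketch of the two-binary-cross-entropy idea, assuming a shared embedding network with two hypothetical heads (product identity within the target machine type, and target-machine vs. other); the dimensions, labels, and equal weighting are placeholders, not the paper's configuration:

```python
import torch
import torch.nn as nn

class OutlierExposureNet(nn.Module):
    """Illustrative embedding network with two heads; not the authors' architecture."""
    def __init__(self, feat_dim=128, emb_dim=64, num_products=4):
        super().__init__()
        self.embed = nn.Sequential(nn.Linear(feat_dim, emb_dim), nn.ReLU())
        self.product_head = nn.Linear(emb_dim, num_products)  # which product of the target machine
        self.machine_head = nn.Linear(emb_dim, 1)              # target machine vs. other sound

    def forward(self, x):
        z = self.embed(x)
        return z, self.product_head(z), self.machine_head(z)

model = OutlierExposureNet()
bce = nn.BCEWithLogitsLoss()

x = torch.randn(8, 128)                                    # placeholder acoustic features
product_onehot = torch.eye(4)[torch.randint(0, 4, (8,))]   # placeholder product labels (one-hot)
is_target = torch.randint(0, 2, (8, 1)).float()            # placeholder target/non-target labels

_, product_logits, machine_logits = model(x)
# Multi-task training objective: sum of the two binary cross-entropies.
loss = bce(product_logits, product_onehot) + bce(machine_logits, is_target)
loss.backward()
```
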
arXiv:2205.06053 (https://arxiv.org/abs/2205.06053) [cs.SD, cs.LG, eess.AS]
Unified Source-Filter GAN with Harmonic-plus-Noise Source Excitation Generation
Authors: Reo Yoneyama, Yi-Chiao Wu, Tomoki Toda
Abstract: This paper introduces a unified source-filter network with a harmonic-plus-noise source excitation generation mechanism. In our previous work, we proposed unified Source-Filter GAN (uSFGAN) for developing a high-fidelity neural vocoder with flexible voice controllability using a unified source-filter neural network architecture. However, the capability of uSFGAN to model the aperiodic source excitation signal is insufficient, and there is still a gap in sound quality between the natural and generated speech. To improve the source excitation modeling and generated sound quality, a new source excitation generation network separately generating periodic and aperiodic components is proposed. The advanced adversarial training procedure of HiFiGAN is also adopted to replace that of Parallel WaveGAN used in the original uSFGAN. Both objective and subjective evaluation results show that the modified uSFGAN significantly improves the sound quality of the basic uSFGAN while maintaining the voice controllability.
Submitted 30 June, 2022; v1 submitted 12 May, 2022; originally announced May 2022.
Comments: Accepted to INTERSPEECH 2022

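To make the harmonic-plus-noise idea concrete, here is a minimal NumPy sketch that builds a source excitation as the sum of a periodic component (a sine following an F0 contour) and an aperiodic component (Gaussian noise); the F0 value, noise level, and simple mixing are illustrative placeholders, not the uSFGAN generation network:

```python
import numpy as np

sr = 16000                       # sampling rate (Hz), placeholder
f0 = np.full(sr, 150.0)          # one second of a flat 150 Hz F0 contour, placeholder
voiced = f0 > 0                  # voiced/unvoiced mask

# Periodic component: sine wave whose phase follows the instantaneous F0.
phase = 2.0 * np.pi * np.cumsum(f0 / sr)
periodic = np.sin(phase) * voiced

# Aperiodic component: Gaussian noise.
aperiodic = 0.1 * np.random.randn(sr)

# Harmonic-plus-noise source excitation (in uSFGAN this would then be shaped by the filter network).
excitation = periodic + aperiodic
```
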
arXiv:2203.15431 (https://arxiv.org/abs/2203.15431) [cs.SD, eess.AS]
Investigating Self-supervised Pretraining Frameworks for Pathological Speech Recognition
Authors: Lester Phillip Violeta, Wen-Chin Huang, Tomoki Toda
Abstract: We investigate the performance of self-supervised pretraining frameworks on pathological speech datasets used for automatic speech recognition (ASR). Modern end-to-end models require thousands of hours of data to train well, but only a small number of pathological speech datasets are publicly available. A proven solution to this problem is to first pretrain the model on a large amount of healthy speech data and then fine-tune it on the pathological speech datasets. One new pretraining framework, called self-supervised learning (SSL), trains a network using only speech data, providing more flexibility in training data requirements and allowing more speech data to be used in pretraining. We investigate SSL frameworks such as the wav2vec 2.0 and WavLM models using different setups and compare their performance with different supervised pretraining setups, using two types of pathological speech, namely, Japanese electrolaryngeal and English dysarthric speech. Our results show that although SSL has shown success with minimally resourced healthy speech, we do not find this to be the case with pathological speech. The best supervised setup outperforms the best SSL setup by 13.9% character error rate in electrolaryngeal speech and 16.8% word error rate in dysarthric speech.
Submitted 29 June, 2022; v1 submitted 29 March, 2022; originally announced March 2022.
Comments: Accepted to INTERSPEECH 2022

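As a minimal illustration of the SSL starting point described above, the sketch below loads a wav2vec 2.0 model pretrained on healthy speech via torchaudio and extracts frame-level representations on which a downstream ASR head would be fine-tuned; the bundle choice and the random waveform are placeholders, not the paper's experimental setup:

```python
import torch
import torchaudio

# Load a wav2vec 2.0 model pretrained (self-supervised) on healthy speech.
bundle = torchaudio.pipelines.WAV2VEC2_BASE
model = bundle.get_model().eval()

# Placeholder one-second utterance at the bundle's expected sampling rate.
waveform = torch.randn(1, int(bundle.sample_rate))

with torch.inference_mode():
    # Returns per-layer frame-level features; fine-tuning for pathological ASR
    # would attach a CTC/attention head on top of these representations.
    features, _ = model.extract_features(waveform)

print(len(features), features[-1].shape)  # e.g. 12 transformer layers, (1, frames, 768)
```
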
arXiv:2203.11389 (https://arxiv.org/abs/2203.11389) [cs.SD, eess.AS]
The VoiceMOS Challenge 2022
Authors: Wen-Chin Huang, Erica Cooper, Yu Tsao, Hsin-Min Wang, Tomoki Toda, Junichi Yamagishi
Abstract: We present the first edition of the VoiceMOS Challenge, a scientific event that aims to promote the study of automatic prediction of the mean opinion score (MOS) of synthetic speech. This challenge drew 22 participating teams from academia and industry, who tried a variety of approaches to tackle the problem of predicting human ratings of synthesized speech. The listening test data for the main track of the challenge consisted of samples from 187 different text-to-speech and voice conversion systems spanning over a decade of research, and the out-of-domain track consisted of data from more recent systems rated in a separate listening test. Results of the challenge show the effectiveness of fine-tuning self-supervised speech models for the MOS prediction task, as well as the difficulty of predicting MOS ratings for unseen speakers and listeners, and for unseen systems in the out-of-domain setting.
Submitted 3 July, 2022; v1 submitted 21 March, 2022; originally announced March 2022.
Comments: Accepted to Interspeech 2022

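A minimal sketch of the kind of SSL-based MOS predictor that the challenge results favour: self-supervised features are mean-pooled over time and mapped to a scalar MOS by a small regression head. The feature dimension and the head below are hypothetical placeholders, not any particular participating system:

```python
import torch
import torch.nn as nn

class SSLMOSPredictor(nn.Module):
    """Illustrative MOS regression head on top of SSL features (not a challenge system)."""
    def __init__(self, ssl_dim=768):
        super().__init__()
        self.head = nn.Sequential(nn.Linear(ssl_dim, 128), nn.ReLU(), nn.Linear(128, 1))

    def forward(self, ssl_features):              # (batch, frames, ssl_dim)
        pooled = ssl_features.mean(dim=1)          # mean-pool over time
        return self.head(pooled).squeeze(-1)       # predicted MOS per utterance

predictor = SSLMOSPredictor()
dummy_features = torch.randn(4, 200, 768)          # placeholder SSL features
loss = nn.functional.mse_loss(predictor(dummy_features), torch.full((4,), 3.5))
```
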
arXiv:2111.07116 (https://arxiv.org/abs/2111.07116) [cs.SD, eess.AS]
Direct Noisy Speech Modeling for Noisy-to-Noisy Voice Conversion
Authors: Chao Xie, Yi-Chiao Wu, Patrick Lumban Tobing, Wen-Chin Huang, Tomoki Toda
Abstract: Beyond conventional voice conversion (VC), where the speaker information is converted without altering the linguistic content, the background sounds are informative and need to be retained in some real-world scenarios, such as VC in movie/video and VC in music, where the voice is entangled with background sounds. As a new VC framework, we have developed a noisy-to-noisy (N2N) VC framework to convert the speaker's identity while preserving the background sounds. Although our framework, consisting of a denoising module and a VC module, handles the background sounds well, the VC module is sensitive to the distortion caused by the denoising module. To address this distortion issue, in this paper we propose an improved VC module that directly models the noisy speech waveform while controlling the background sounds. The experimental results demonstrate that our improved framework significantly outperforms the previous one and achieves an acceptable score in terms of naturalness, while reaching similarity performance comparable to the upper bound of our framework.
Submitted 13 November, 2021; originally announced November 2021.

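The N2N idea of converting the voice while keeping the background can be sketched as a decompose-convert-remix pipeline. The `separate` and `convert_voice` functions below are hypothetical placeholders, and the paper's improved module instead models the noisy waveform directly, so treat this only as an illustration of the overall framework:

```python
import numpy as np

def separate(noisy_wave):
    """Placeholder denoising module: returns (estimated_speech, estimated_background)."""
    speech = noisy_wave                 # identity stand-in
    background = noisy_wave - speech    # zeros under the stand-in
    return speech, background

def convert_voice(speech, target_speaker):
    """Placeholder VC module operating on the (possibly distorted) denoised speech."""
    return speech

def noisy_to_noisy_vc(noisy_wave, target_speaker):
    speech, background = separate(noisy_wave)
    converted = convert_voice(speech, target_speaker)
    # Remix the converted voice with the preserved background sounds.
    return converted + background

out = noisy_to_noisy_vc(np.zeros(16000, dtype=np.float32), target_speaker="target")
```
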
arXiv:2111.05691 (https://arxiv.org/abs/2111.05691) [eess.AS, cs.AI, cs.SD]
HASA-net: A non-intrusive hearing-aid speech assessment network
Authors: Hsin-Tien Chiang, Yi-Chiao Wu, Cheng Yu, Tomoki Toda, Hsin-Min Wang, Yih-Chun Hu, Yu Tsao
Abstract: Without the need for a clean reference, non-intrusive speech assessment methods have attracted great attention for objective evaluations. Recently, deep neural network (DNN) models have been applied to build non-intrusive speech assessment approaches and confirmed to provide promising performance. However, most DNN-based approaches are designed for normal-hearing listeners without considering hearing-loss factors. In this study, we propose a DNN-based hearing aid speech assessment network (HASA-Net), formed by a bidirectional long short-term memory (BLSTM) model, to predict speech quality and intelligibility scores simultaneously according to input speech signals and specified hearing-loss patterns. To the best of our knowledge, HASA-Net is the first work to incorporate quality and intelligibility assessments utilizing a unified DNN-based non-intrusive model for hearing aids. Experimental results show that the predicted speech quality and intelligibility scores of HASA-Net are highly correlated with two well-known intrusive hearing-aid evaluation metrics, the hearing aid speech quality index (HASQI) and the hearing aid speech perception index (HASPI), respectively.
Submitted 10 November, 2021; originally announced November 2021.

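A minimal PyTorch sketch of the kind of architecture described: a BLSTM consumes spectral frames concatenated with a hearing-loss pattern (e.g., an audiogram vector) and two linear heads emit quality and intelligibility scores. All dimensions below are hypothetical, not HASA-Net's actual configuration:

```python
import torch
import torch.nn as nn

class HearingAidAssessmentNet(nn.Module):
    """Illustrative BLSTM assessor with two output heads (not the published HASA-Net)."""
    def __init__(self, spec_dim=257, hl_dim=8, hidden=128):
        super().__init__()
        self.blstm = nn.LSTM(spec_dim + hl_dim, hidden, batch_first=True, bidirectional=True)
        self.quality_head = nn.Linear(2 * hidden, 1)
        self.intelligibility_head = nn.Linear(2 * hidden, 1)

    def forward(self, spec, hearing_loss):
        # Tile the hearing-loss pattern over time and concatenate with the spectral frames.
        hl = hearing_loss.unsqueeze(1).expand(-1, spec.size(1), -1)
        out, _ = self.blstm(torch.cat([spec, hl], dim=-1))
        pooled = out.mean(dim=1)                      # utterance-level summary
        return self.quality_head(pooled), self.intelligibility_head(pooled)

net = HearingAidAssessmentNet()
quality, intelligibility = net(torch.randn(2, 300, 257), torch.randn(2, 8))
```
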
arXiv:2110.09103 (https://arxiv.org/abs/2110.09103) [cs.SD, cs.CL, eess.AS]
LDNet: Unified Listener Dependent Modeling in MOS Prediction for Synthetic Speech
Authors: Wen-Chin Huang, Erica Cooper, Junichi Yamagishi, Tomoki Toda
Abstract: An effective approach to automatically predicting the subjective rating of synthetic speech is to train on a listening test dataset with human-annotated scores. Although each speech sample in the dataset is rated by several listeners, most previous works only used the mean score as the training target. In this work, we present LDNet, a unified framework for mean opinion score (MOS) prediction that predicts the listener-wise perceived quality given the input speech and the listener identity. We reflect recent advances in LD modeling, including design choices of the model architecture, and propose two inference methods that provide more stable results and efficient computation. We conduct systematic experiments on the voice conversion challenge (VCC) 2018 benchmark and a newly collected large-scale MOS dataset, providing an in-depth analysis of the proposed framework. Results show that the mean listener inference method is a better way to utilize the mean scores, and its effectiveness is more obvious when more ratings per sample are available.
Submitted 18 October, 2021; originally announced October 2021.
Comments: Submitted to ICASSP 2022. Code available at: https://github.com/unilight/LDNet

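A minimal sketch of listener-dependent prediction: a listener-ID embedding is combined with utterance-level speech features, and reserving one extra ID as a virtual "mean listener" gives a simple analogue of the mean listener inference described above. Dimensions and pooling are placeholders, not LDNet's actual architecture (see the linked repository for that):

```python
import torch
import torch.nn as nn

class ListenerDependentMOS(nn.Module):
    """Illustrative listener-dependent MOS predictor (not the released LDNet)."""
    def __init__(self, feat_dim=256, num_listeners=32, emb_dim=16):
        super().__init__()
        # Index `num_listeners` is reserved for a virtual "mean listener".
        self.listener_emb = nn.Embedding(num_listeners + 1, emb_dim)
        self.head = nn.Sequential(nn.Linear(feat_dim + emb_dim, 64), nn.ReLU(), nn.Linear(64, 1))
        self.mean_listener_id = num_listeners

    def forward(self, utterance_feat, listener_id):
        z = torch.cat([utterance_feat, self.listener_emb(listener_id)], dim=-1)
        return self.head(z).squeeze(-1)

model = ListenerDependentMOS()
feat = torch.randn(4, 256)                                    # placeholder utterance features
# Training uses per-listener scores; inference can query the virtual mean listener.
train_pred = model(feat, torch.randint(0, 32, (4,)))
mean_listener_pred = model(feat, torch.full((4,), model.mean_listener_id, dtype=torch.long))
```
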
arXiv:2110.08213 (https://arxiv.org/abs/2110.08213) [cs.SD, cs.CL, eess.AS, q-bio.QM]
Towards Identity Preserving Normal to Dysarthric Voice Conversion
Authors: Wen-Chin Huang, Bence Mark Halpern, Lester Phillip Violeta, Odette Scharenborg, Tomoki Toda
Abstract: We present a voice conversion framework that converts normal speech into dysarthric speech while preserving the speaker identity. Such a framework is essential for (1) clinical decision-making processes and alleviation of patient stress, and (2) data augmentation for dysarthric speech recognition. This is an especially challenging task, since the converted samples should capture the severity of dysarthric speech while being highly natural and possessing the speaker identity of the normal speaker. To this end, we adopted a two-stage framework, which consists of a sequence-to-sequence model and a nonparallel frame-wise model. Objective and subjective evaluations were conducted on the UASpeech dataset, and results showed that the method was able to yield reasonable naturalness and capture severity aspects of the pathological speech. On the other hand, the similarity to the normal source speaker's voice was limited and requires further improvement.
Submitted 15 October, 2021; originally announced October 2021.
Comments: Submitted to ICASSP 2022