Search | arXiv e-print repository
<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 68 results for author: <span class="mathjax">Chung, J S</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> <div class="content"> <form method="GET" action="/search/eess" aria-role="search"> Searching in archive <strong>eess</strong>. <a href="/search/?searchtype=author&query=Chung%2C+J+S">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Chung, J S"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Chung%2C+J+S&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Chung, J S"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Chung%2C+J+S&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Chung%2C+J+S&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Chung%2C+J+S&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.13839">arXiv:2410.13839</a> <span> [<a href="https://arxiv.org/pdf/2410.13839">pdf</a>, <a href="https://arxiv.org/format/2410.13839">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Accelerating Codec-based Speech Synthesis with Multi-Token Prediction and Speculative Decoding </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Nguyen%2C+T+D">Tan Dat Nguyen</a>, <a href="/search/eess?searchtype=author&query=Kim%2C+J">Ji-Hoon Kim</a>, <a href="/search/eess?searchtype=author&query=Choi%2C+J">Jeongsoo Choi</a>, <a href="/search/eess?searchtype=author&query=Choi%2C+S">Shukjae Choi</a>, <a href="/search/eess?searchtype=author&query=Park%2C+J">Jinseok Park</a>, <a href="/search/eess?searchtype=author&query=Lee%2C+Y">Younglo Lee</a>, <a href="/search/eess?searchtype=author&query=Chung%2C+J+S">Joon Son Chung</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.13839v1-abstract-short" style="display: inline;"> The goal of this paper is to accelerate codec-based speech synthesis systems with minimum sacrifice to speech quality. We propose an enhanced inference method that allows for flexible trade-offs between speed and quality during inference without requiring additional training. 
Abstract: The goal of this paper is to accelerate codec-based speech synthesis systems with minimum sacrifice to speech quality. We propose an enhanced inference method that allows for flexible trade-offs between speed and quality during inference without requiring additional training. Our core idea is to predict multiple tokens per inference step of the AR module using multiple prediction heads, resulting in a linear reduction in synthesis time as the number of heads increases. Furthermore, we introduce a novel speculative decoding technique that utilises a Viterbi-based algorithm to select the optimal sequence of generated tokens at each decoding step. In our experiments, we demonstrate that the time required to predict each token is reduced by a factor of 4 to 5 compared to baseline models, with minimal quality trade-off or even improvement in terms of speech intelligibility. Audio samples are available at: multpletokensprediction.github.io/multipletokensprediction.github.io/.
Submitted 17 October, 2024; originally announced October 2024.
Comments: Submitted to IEEE ICASSP 2025
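The multi-token mechanism summarised above is easy to picture in code: the AR decoder is given several output heads, each predicting a different offset into the future, so one forward pass emits several codec tokens. A minimal PyTorch sketch follows; the names and dimensions are illustrative assumptions, not the authors' implementation, and the Viterbi-based speculative verification step is omitted.

    # Minimal sketch of multi-token prediction for an AR codec decoder.
    # All names and dimensions are illustrative, not from the paper.
    import torch
    import torch.nn as nn

    class MultiTokenHead(nn.Module):
        def __init__(self, d_model: int, vocab_size: int, n_heads: int = 4):
            super().__init__()
            # One projection per future position: head i predicts token t+i.
            self.heads = nn.ModuleList(
                [nn.Linear(d_model, vocab_size) for _ in range(n_heads)]
            )

        def forward(self, hidden: torch.Tensor) -> torch.Tensor:
            # hidden: (batch, d_model), the decoder state at the current step.
            # Returns (batch, n_heads, vocab_size): one distribution per
            # future token, so each AR step emits n_heads tokens at once.
            return torch.stack([head(hidden) for head in self.heads], dim=1)

    decoder_state = torch.randn(2, 512)      # stand-in for an AR decoder output
    logits = MultiTokenHead(512, 1024)(decoder_state)
    next_tokens = logits.argmax(dim=-1)      # (2, 4): four tokens per AR step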
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Submitted to IEEE ICASSP 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.17285">arXiv:2409.17285</a> <span> [<a href="https://arxiv.org/pdf/2409.17285">pdf</a>, <a href="https://arxiv.org/format/2409.17285">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> SpoofCeleb: Speech Deepfake Detection and SASV In The Wild </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Jung%2C+J">Jee-weon Jung</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+Y">Yihan Wu</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+X">Xin Wang</a>, <a href="/search/eess?searchtype=author&query=Kim%2C+J">Ji-Hoon Kim</a>, <a href="/search/eess?searchtype=author&query=Maiti%2C+S">Soumi Maiti</a>, <a href="/search/eess?searchtype=author&query=Matsunaga%2C+Y">Yuta Matsunaga</a>, <a href="/search/eess?searchtype=author&query=Shim%2C+H">Hye-jin Shim</a>, <a href="/search/eess?searchtype=author&query=Tian%2C+J">Jinchuan Tian</a>, <a href="/search/eess?searchtype=author&query=Evans%2C+N">Nicholas Evans</a>, <a href="/search/eess?searchtype=author&query=Chung%2C+J+S">Joon Son Chung</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+W">Wangyou Zhang</a>, <a href="/search/eess?searchtype=author&query=Um%2C+S">Seyun Um</a>, <a href="/search/eess?searchtype=author&query=Takamichi%2C+S">Shinnosuke Takamichi</a>, <a href="/search/eess?searchtype=author&query=Watanabe%2C+S">Shinji Watanabe</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.17285v1-abstract-short" style="display: inline;"> This paper introduces SpoofCeleb, a dataset designed for Speech Deepfake Detection (SDD) and Spoofing-robust Automatic Speaker Verification (SASV), utilizing source data from real-world conditions and spoofing attacks generated by Text-To-Speech (TTS) systems also trained on the same real-world data. Robust recognition systems require speech data recorded in varied acoustic environments with diffe… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.17285v1-abstract-full').style.display = 'inline'; document.getElementById('2409.17285v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.17285v1-abstract-full" style="display: none;"> This paper introduces SpoofCeleb, a dataset designed for Speech Deepfake Detection (SDD) and Spoofing-robust Automatic Speaker Verification (SASV), utilizing source data from real-world conditions and spoofing attacks generated by Text-To-Speech (TTS) systems also trained on the same real-world data. Robust recognition systems require speech data recorded in varied acoustic environments with different levels of noise to be trained. 
However, existing datasets typically include clean, high-quality recordings (bona fide data) due to the requirements for TTS training; studio-quality or well-recorded read speech is typically necessary to train TTS models. Existing SDD datasets also have limited usefulness for training SASV models due to insufficient speaker diversity. We present SpoofCeleb, which leverages a fully automated pipeline that processes the VoxCeleb1 dataset, transforming it into a suitable form for TTS training. We subsequently train 23 contemporary TTS systems. The resulting SpoofCeleb dataset comprises over 2.5 million utterances from 1,251 unique speakers, collected under natural, real-world conditions. The dataset includes carefully partitioned training, validation, and evaluation sets with well-controlled experimental protocols. We provide baseline results for both SDD and SASV tasks. All data, protocols, and baselines are publicly available at https://jungjee.github.io/spoofceleb.
Submitted 18 September, 2024; originally announced September 2024.
Comments: 9 pages, 2 figures, 8 tables

3. arXiv:2409.08711 [pdf, other] eess.AS cs.AI
Text-To-Speech Synthesis In The Wild
Authors: Jee-weon Jung, Wangyou Zhang, Soumi Maiti, Yihan Wu, Xin Wang, Ji-Hoon Kim, Yuta Matsunaga, Seyun Um, Jinchuan Tian, Hye-jin Shim, Nicholas Evans, Joon Son Chung, Shinnosuke Takamichi, Shinji Watanabe
has-text-grey-dark mathjax" id="2409.08711v1-abstract-short" style="display: inline;"> Text-to-speech (TTS) systems are traditionally trained using modest databases of studio-quality, prompted or read speech collected in benign acoustic environments such as anechoic rooms. The recent literature nonetheless shows efforts to train TTS systems using data collected in the wild. While this approach allows for the use of massive quantities of natural speech, until now, there are no common… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.08711v1-abstract-full').style.display = 'inline'; document.getElementById('2409.08711v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.08711v1-abstract-full" style="display: none;"> Text-to-speech (TTS) systems are traditionally trained using modest databases of studio-quality, prompted or read speech collected in benign acoustic environments such as anechoic rooms. The recent literature nonetheless shows efforts to train TTS systems using data collected in the wild. While this approach allows for the use of massive quantities of natural speech, until now, there are no common datasets. We introduce the TTS In the Wild (TITW) dataset, the result of a fully automated pipeline, in this case, applied to the VoxCeleb1 dataset commonly used for speaker recognition. We further propose two training sets. TITW-Hard is derived from the transcription, segmentation, and selection of VoxCeleb1 source data. TITW-Easy is derived from the additional application of enhancement and additional data selection based on DNSMOS. We show that a number of recent TTS models can be trained successfully using TITW-Easy, but that it remains extremely challenging to produce similar results using TITW-Hard. Both the dataset and protocols are publicly available and support the benchmarking of TTS systems trained using TITW data. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.08711v1-abstract-full').style.display = 'none'; document.getElementById('2409.08711v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages, submitted to ICASSP 2025 as a conference paper</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.14886">arXiv:2408.14886</a> <span> [<a href="https://arxiv.org/pdf/2408.14886">pdf</a>, <a href="https://arxiv.org/format/2408.14886">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/TASLP.2024.3444456">10.1109/TASLP.2024.3444456 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> The VoxCeleb Speaker Recognition Challenge: A Retrospective </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Huh%2C+J">Jaesung Huh</a>, <a href="/search/eess?searchtype=author&query=Chung%2C+J+S">Joon Son Chung</a>, <a href="/search/eess?searchtype=author&query=Nagrani%2C+A">Arsha Nagrani</a>, <a href="/search/eess?searchtype=author&query=Brown%2C+A">Andrew Brown</a>, <a href="/search/eess?searchtype=author&query=Jung%2C+J">Jee-weon Jung</a>, <a href="/search/eess?searchtype=author&query=Garcia-Romero%2C+D">Daniel Garcia-Romero</a>, <a href="/search/eess?searchtype=author&query=Zisserman%2C+A">Andrew Zisserman</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.14886v1-abstract-short" style="display: inline;"> The VoxCeleb Speaker Recognition Challenges (VoxSRC) were a series of challenges and workshops that ran annually from 2019 to 2023. The challenges primarily evaluated the tasks of speaker recognition and diarisation under various settings including: closed and open training data; as well as supervised, self-supervised, and semi-supervised training for domain adaptation. The challenges also provide… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.14886v1-abstract-full').style.display = 'inline'; document.getElementById('2408.14886v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.14886v1-abstract-full" style="display: none;"> The VoxCeleb Speaker Recognition Challenges (VoxSRC) were a series of challenges and workshops that ran annually from 2019 to 2023. The challenges primarily evaluated the tasks of speaker recognition and diarisation under various settings including: closed and open training data; as well as supervised, self-supervised, and semi-supervised training for domain adaptation. The challenges also provided publicly available training and evaluation datasets for each task and setting, with new test sets released each year. 
In this paper, we provide a review of these challenges that covers: what they explored; the methods developed by the challenge participants and how these evolved; and the current state of the field for speaker verification and diarisation. We chart the progress in performance over the five installments of the challenge on a common evaluation dataset and provide a detailed analysis of how each year's special focus affected participants' performance. This paper is aimed both at researchers who want an overview of the speaker recognition and diarisation field, and at challenge organisers who want to benefit from the successes and avoid the mistakes of the VoxSRC challenges. We end with a discussion of the current strengths of the field and open challenges. Project page: https://mm.kaist.ac.kr/datasets/voxceleb/voxsrc/workshop.html
Submitted 27 August, 2024; originally announced August 2024.
Comments: TASLP 2024

5. arXiv:2408.03593 [pdf, other] eess.AS
Bridging the Gap between Audio and Text using Parallel-attention for User-defined Keyword Spotting
Authors: Youkyum Kim, Jaemin Jung, Jihwan Park, Byeong-Yeol Kim, Joon Son Chung
Abstract: This paper proposes a novel user-defined keyword spotting framework that accurately detects audio keywords based on text enrollment. Since audio data possesses additional acoustic information compared to text, there are discrepancies between these two modalities.
To address this challenge, we present ParallelKWS, which utilises self- and cross-attention in a parallel architecture to effectively capture information both within and across the two modalities. We further propose a phoneme duration-based alignment loss that enforces the sequential correspondence between audio and text features. Extensive experimental results demonstrate that our proposed method achieves state-of-the-art performance on several benchmark datasets in both seen and unseen domains, without incorporating extra data beyond the dataset used in previous studies.
Submitted 7 August, 2024; originally announced August 2024.
Comments: This work has been submitted to the IEEE for possible publication
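The parallel self- and cross-attention named in this abstract can be sketched with stock attention layers: one branch attends within the audio sequence while the other attends from audio to text, and the two branches are fused. A hedged sketch under assumed shapes and an assumed additive fusion, not the ParallelKWS architecture itself:

    # Illustrative parallel self-/cross-attention over audio and text features.
    import torch
    import torch.nn as nn

    class ParallelAttentionBlock(nn.Module):
        def __init__(self, d_model: int = 256, n_heads: int = 4):
            super().__init__()
            self.self_attn = nn.MultiheadAttention(d_model, n_heads, batch_first=True)
            self.cross_attn = nn.MultiheadAttention(d_model, n_heads, batch_first=True)

        def forward(self, audio: torch.Tensor, text: torch.Tensor) -> torch.Tensor:
            # audio: (B, Ta, D) frame embeddings; text: (B, Tt, D) phoneme embeddings.
            within, _ = self.self_attn(audio, audio, audio)   # within-modality branch
            across, _ = self.cross_attn(audio, text, text)    # cross-modality branch
            return within + across                            # parallel fusion (assumed)

    audio = torch.randn(8, 100, 256)   # assumed frame-level audio features
    text = torch.randn(8, 12, 256)     # assumed text/phoneme features
    fused = ParallelAttentionBlock()(audio, text)             # (8, 100, 256)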
6. arXiv:2407.18505 [pdf, other] eess.AS
VoxSim: A perceptual voice similarity dataset
Authors: Junseok Ahn, Youkyum Kim, Yeunju Choi, Doyeop Kwak, Ji-Hoon Kim, Seongkyu Mun, Joon Son Chung
Abstract: This paper introduces VoxSim, a dataset of perceptual voice similarity ratings. Recent efforts to automate the assessment of speech synthesis technologies have primarily focused on predicting mean opinion score of naturalness, leaving speaker voice similarity relatively unexplored due to a lack of extensive training data. To address this, we generate about 41k utterance pairs from the VoxCeleb dataset, a widely utilised speech dataset for speaker recognition, and collect nearly 70k speaker similarity scores through a listening test. VoxSim offers a valuable resource for the development and benchmarking of speaker similarity prediction models. We provide baseline results of speaker similarity prediction models on the VoxSim test set and further demonstrate that the model trained on our dataset generalises to the out-of-domain VCC2018 dataset.
Submitted 26 July, 2024; originally announced July 2024.
Comments: INTERSPEECH 2024.
The dataset is available from https://mm.kaist.ac.kr/projects/voxsim/

7. arXiv:2407.13676 [pdf, other] cs.MM cs.CV cs.SD eess.AS
Aligning Sight and Sound: Advanced Sound Source Localization Through Audio-Visual Alignment
Authors: Arda Senocak, Hyeonggon Ryu, Junsik Kim, Tae-Hyun Oh, Hanspeter Pfister, Joon Son Chung
Abstract: Recent studies on learning-based sound source localization have mainly focused on the localization performance perspective. However, prior work and existing benchmarks overlook a crucial aspect: cross-modal interaction, which is essential for interactive sound source localization. Cross-modal interaction is vital for understanding semantically matched or mismatched audio-visual events, such as silent objects or off-screen sounds. In this paper, we first comprehensively examine the cross-modal interaction of existing methods, benchmarks, evaluation metrics, and cross-modal understanding tasks. Then, we identify the limitations of previous studies and make several contributions to overcome them. First, we introduce a new synthetic benchmark for interactive sound source localization. Second, we introduce new evaluation metrics to rigorously assess sound source localization methods, focusing on accurately evaluating both localization performance and cross-modal interaction ability.
Third, we propose a learning framework with a cross-modal alignment strategy to enhance cross-modal interaction. Lastly, we evaluate both interactive sound source localization and auxiliary cross-modal retrieval tasks together to thoroughly assess cross-modal interaction capabilities and benchmark competing methods. Our new benchmarks and evaluation metrics reveal previously overlooked issues in sound source localization studies. Our proposed novel method, with enhanced cross-modal alignment, shows superior sound source localization performance. This work provides the most comprehensive analysis of sound source localization to date, with extensive validation of competing methods on both existing and new benchmarks using new and standard evaluation metrics.
Submitted 18 July, 2024; originally announced July 2024.
Comments: Journal extension of ICCV 2023 paper (arXiv:2309.10724). Code is available at https://github.com/kaistmm/SSLalignment
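A cross-modal alignment strategy of the kind mentioned here is commonly realised as a symmetric contrastive (InfoNCE) objective over paired audio and visual embeddings. The sketch below shows that generic form; the temperature and the pooled-embedding setup are assumptions, not the paper's exact recipe.

    # Symmetric audio-visual InfoNCE: matched pairs sit on the diagonal.
    import torch
    import torch.nn.functional as F

    def audio_visual_nce(audio_emb, visual_emb, temperature: float = 0.07):
        # audio_emb, visual_emb: (B, D) pooled embeddings of paired clips.
        a = F.normalize(audio_emb, dim=-1)
        v = F.normalize(visual_emb, dim=-1)
        logits = a @ v.t() / temperature          # (B, B) similarity matrix
        targets = torch.arange(a.size(0), device=a.device)
        # Align audio-to-visual and visual-to-audio directions.
        return (F.cross_entropy(logits, targets)
                + F.cross_entropy(logits.t(), targets)) / 2

    loss = audio_visual_nce(torch.randn(16, 512), torch.randn(16, 512))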
8. arXiv:2407.08691 [pdf, other] cs.SD cs.AI eess.AS
ElasticAST: An Audio Spectrogram Transformer for All Length and Resolutions
Authors: Jiu Feng, Mehmet Hamza Erol, Joon Son Chung, Arda Senocak
Abstract: Transformers have rapidly overtaken CNN-based architectures as the new standard in audio classification. Transformer-based models, such as the Audio Spectrogram Transformer (AST), also inherit the fixed-size input paradigm from CNNs. However, this leads to performance degradation for ASTs at inference time when input lengths differ from those seen in training. This paper introduces an approach that enables the use of variable-length audio inputs with AST models during both training and inference. By employing sequence packing, our method, ElasticAST, accommodates any audio length during training, thereby offering flexibility across all lengths and resolutions at inference. This flexibility allows ElasticAST to maintain evaluation capabilities at various lengths or resolutions and to achieve performance similar to that of standard ASTs trained at specific lengths or resolutions. Moreover, experiments demonstrate ElasticAST's better performance when trained and evaluated on native-length audio datasets.
Submitted 11 July, 2024; originally announced July 2024.
Comments: Interspeech 2024. Code is available at https://github.com/JiuFengSC/ElasticAST
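Sequence packing, the mechanism this abstract relies on, can be illustrated independently of the model: variable-length patch sequences are concatenated into one long batch, and a block-diagonal mask keeps attention within each clip. A minimal sketch, not the ElasticAST code:

    # Pack variable-length patch sequences and build a same-clip attention mask.
    import torch

    def pack_with_mask(sequences):
        # sequences: list of (Ti, D) patch-embedding tensors of varying length.
        packed = torch.cat(sequences, dim=0)              # (sum Ti, D)
        ids = torch.cat([torch.full((s.size(0),), i)
                         for i, s in enumerate(sequences)])
        # True where query and key come from the same clip; invert or convert
        # as required by the attention API in use.
        mask = ids.unsqueeze(0) == ids.unsqueeze(1)       # (sum Ti, sum Ti)
        return packed, mask

    clips = [torch.randn(t, 192) for t in (60, 25, 110)]  # three audio clips
    packed, mask = pack_with_mask(clips)                  # (195, 192), (195, 195)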
9. arXiv:2406.14559 [pdf, other] cs.SD eess.AS
Disentangled Representation Learning for Environment-agnostic Speaker Recognition
Authors: KiHyun Nam, Hee-Soo Heo, Jee-weon Jung, Joon Son Chung
Abstract: This work presents a framework based on feature disentanglement to learn speaker embeddings that are robust to environmental variations. Our framework utilises an auto-encoder as a disentangler, dividing the input speaker embedding into components related to the speaker and other residual information. We employ a group of objective functions to ensure that the auto-encoder's code representation, used as the refined embedding, condenses only the speaker characteristics. We show the versatility of our framework through its compatibility with any existing speaker embedding extractor, requiring no structural modifications or adaptations for integration. We validate the effectiveness of our framework by incorporating it into two popularly used embedding extractors and conducting experiments across various benchmarks. The results show a performance improvement of up to 16%. We release our code for this work at https://github.com/kaistmm/voxceleb-disentangler
Submitted 20 June, 2024; originally announced June 2024.
Comments: Interspeech 2024. The official webpage can be found at https://mm.kaist.ac.kr/projects/voxceleb-disentangler/
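The auto-encoder disentangler described above reduces to a few lines: two encoders split the input embedding into a speaker code and a residual code, and a decoder must reconstruct the input from their concatenation. Dimensions are assumptions; only the reconstruction term is shown, and the group of objective functions the abstract mentions would be added on top.

    # Sketch of an embedding disentangler; names and sizes are illustrative.
    import torch
    import torch.nn as nn
    import torch.nn.functional as F

    class Disentangler(nn.Module):
        def __init__(self, d_in: int = 256, d_spk: int = 192, d_res: int = 64):
            super().__init__()
            self.enc_spk = nn.Linear(d_in, d_spk)   # speaker-related code
            self.enc_res = nn.Linear(d_in, d_res)   # residual/environment code
            self.dec = nn.Linear(d_spk + d_res, d_in)

        def forward(self, emb):
            spk, res = self.enc_spk(emb), self.enc_res(emb)
            recon = self.dec(torch.cat([spk, res], dim=-1))
            return spk, res, recon

    emb = torch.randn(32, 256)            # embeddings from any extractor
    spk, res, recon = Disentangler()(emb)
    recon_loss = F.mse_loss(recon, emb)   # `spk` serves as the refined embedding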
10. arXiv:2406.10549 [pdf, other] eess.AS cs.CL cs.SD
Lightweight Audio Segmentation for Long-form Speech Translation
Authors: Jaesong Lee, Soyoon Kim, Hanbyul Kim, Joon Son Chung
Abstract: Speech segmentation is an essential part of speech translation (ST) systems in real-world scenarios. Since most ST models are designed to process speech segments, long-form audio must be partitioned into shorter segments before translation. Recently, data-driven approaches for the speech segmentation task have been developed. Although these approaches improve overall translation quality, a performance gap remains due to a mismatch between the models and ST systems. In addition, prior works require large self-supervised speech models, which consume significant computational resources. In this work, we propose a segmentation model that achieves better speech translation quality with a small model size. We propose an ASR-with-punctuation task as an effective pre-training strategy for the segmentation model. We also show that proper integration of the speech segmentation model into the underlying ST system is critical to improving overall translation quality at inference time.
Submitted 15 June, 2024; originally announced June 2024.
Comments: Accepted to Interspeech 2024
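Whatever the segmentation model, its frame-level output has to be turned into segments an ST system can consume. The sketch below shows one conventional conversion (thresholding plus a minimum duration); the rules and values are assumptions, not the paper's inference procedure.

    # Convert per-frame speech probabilities into (start, end) segments.
    import numpy as np

    def frames_to_segments(speech_prob, threshold=0.5, frame_dur=0.02, min_dur=0.2):
        # speech_prob: (T,) per-frame probability that the frame is speech.
        active = speech_prob > threshold
        segments, start = [], None
        for t, a in enumerate(active):
            if a and start is None:
                start = t
            elif not a and start is not None:
                if (t - start) * frame_dur >= min_dur:
                    segments.append((start * frame_dur, t * frame_dur))
                start = None
        if start is not None:
            segments.append((start * frame_dur, len(active) * frame_dur))
        return segments  # (start_sec, end_sec) pairs to feed the ST model

    probs = np.concatenate([np.zeros(10), np.full(30, 0.9), np.zeros(10)])
    print(frames_to_segments(probs))      # one segment of 0.6 s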
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.10549v1-abstract-full').style.display = 'none'; document.getElementById('2406.10549v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to Interspeech 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.09286">arXiv:2406.09286</a> <span> [<a href="https://arxiv.org/pdf/2406.09286">pdf</a>, <a href="https://arxiv.org/format/2406.09286">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> FlowAVSE: Efficient Audio-Visual Speech Enhancement with Conditional Flow Matching </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Jung%2C+C">Chaeyoung Jung</a>, <a href="/search/eess?searchtype=author&query=Lee%2C+S">Suyeon Lee</a>, <a href="/search/eess?searchtype=author&query=Kim%2C+J">Ji-Hoon Kim</a>, <a href="/search/eess?searchtype=author&query=Chung%2C+J+S">Joon Son Chung</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.09286v1-abstract-short" style="display: inline;"> This work proposes an efficient method to enhance the quality of corrupted speech signals by leveraging both acoustic and visual cues. While existing diffusion-based approaches have demonstrated remarkable quality, their applicability is limited by slow inference speeds and computational complexity. To address this issue, we present FlowAVSE which enhances the inference speed and reduces the numbe… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.09286v1-abstract-full').style.display = 'inline'; document.getElementById('2406.09286v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.09286v1-abstract-full" style="display: none;"> This work proposes an efficient method to enhance the quality of corrupted speech signals by leveraging both acoustic and visual cues. While existing diffusion-based approaches have demonstrated remarkable quality, their applicability is limited by slow inference speeds and computational complexity. To address this issue, we present FlowAVSE which enhances the inference speed and reduces the number of learnable parameters without degrading the output quality. In particular, we employ a conditional flow matching algorithm that enables the generation of high-quality speech in a single sampling step. Moreover, we increase efficiency by optimizing the underlying U-net architecture of diffusion-based systems. Our experiments demonstrate that FlowAVSE achieves 22 times faster inference speed and reduces the model size by half while maintaining the output quality. 
12. arXiv:2406.05339 [pdf, other] eess.AS cs.AI
To what extent can ASV systems naturally defend against spoofing attacks?
Authors: Jee-weon Jung, Xin Wang, Nicholas Evans, Shinji Watanabe, Hye-jin Shim, Hemlata Tak, Sidhhant Arora, Junichi Yamagishi, Joon Son Chung
Abstract: The current automatic speaker verification (ASV) task involves making binary decisions on two types of trials: target and non-target. However, emerging advancements in speech generation technology pose significant threats to the reliability of ASV systems.
arXiv:2406.05339 [pdf, other] | eess.AS cs.AI
To what extent can ASV systems naturally defend against spoofing attacks?
Authors: Jee-weon Jung, Xin Wang, Nicholas Evans, Shinji Watanabe, Hye-jin Shim, Hemlata Tak, Sidhhant Arora, Junichi Yamagishi, Joon Son Chung
Abstract: The current automatic speaker verification (ASV) task involves making binary decisions on two types of trials: target and non-target. However, emerging advancements in speech generation technology pose significant threats to the reliability of ASV systems. This study investigates whether ASV effortlessly acquires robustness against spoofing attacks (i.e., zero-shot capability) by systematically exploring diverse ASV systems and spoofing attacks, ranging from traditional to cutting-edge techniques. Through extensive analyses conducted on eight distinct ASV systems and 29 spoofing attack systems, we demonstrate that the evolution of ASV inherently incorporates defense mechanisms against spoofing attacks. Nevertheless, our findings also underscore that the advancement of spoofing attacks far outpaces that of ASV systems, hence necessitating further research on spoofing-robust ASV methodologies.
Submitted 17 November, 2024; v1 submitted 7 June, 2024; originally announced June 2024.
Comments: 5 pages, 3 figures, 3 tables, Interspeech 2024
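Studies like the one above typically report spoofing robustness via the equal error rate over trial scores. A small sketch of the standard EER computation, not taken from the paper, is below.

```python
import numpy as np

def equal_error_rate(scores, labels):
    """EER: the operating point where false-accept and false-reject rates cross.

    scores: similarity scores, higher means more likely a target trial.
    labels: 1 for target trials, 0 for non-target (or spoofed) trials.
    """
    order = np.argsort(scores)[::-1]          # sweep thresholds high -> low
    labels = np.asarray(labels)[order]
    n_tgt, n_non = labels.sum(), (1 - labels).sum()
    fa = np.cumsum(1 - labels) / n_non        # false accepts grow as threshold drops
    fr = 1 - np.cumsum(labels) / n_tgt        # false rejects shrink
    idx = np.argmin(np.abs(fa - fr))
    return (fa[idx] + fr[idx]) / 2

# Toy example: target trials score high, spoofed/non-target trials lower.
print(equal_error_rate([0.9, 0.8, 0.4, 0.3], [1, 1, 0, 0]))  # 0.0
```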
arXiv:2406.03344 [pdf, other] | cs.SD cs.AI eess.AS
Audio Mamba: Bidirectional State Space Model for Audio Representation Learning
Authors: Mehmet Hamza Erol, Arda Senocak, Jiu Feng, Joon Son Chung
Abstract: Transformers have rapidly become the preferred choice for audio classification, surpassing methods based on CNNs. However, Audio Spectrogram Transformers (ASTs) exhibit quadratic scaling due to self-attention. The removal of this quadratic self-attention cost presents an appealing direction. Recently, state space models (SSMs), such as Mamba, have demonstrated potential in language and vision tasks in this regard. In this study, we explore whether reliance on self-attention is necessary for audio classification tasks. By introducing Audio Mamba (AuM), the first self-attention-free, purely SSM-based model for audio classification, we aim to address this question. We evaluate AuM on various audio datasets, comprising six different benchmarks, where it achieves comparable or better performance than the well-established AST model.
Submitted 5 June, 2024; originally announced June 2024.
Comments: Code is available at https://github.com/mhamzaerol/Audio-Mamba-AuM
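The entry's key design point is attention-free, bidirectional sequence processing over spectrogram patches. A rough illustration of that shape of computation follows; a GRU stands in for the Mamba SSM block, and every dimension here is hypothetical, so this is not the AuM architecture.

```python
import torch
import torch.nn as nn

class BidirectionalAudioEncoder(nn.Module):
    """Illustration of a bidirectional, attention-free encoder: spectrogram
    patches are scanned as a sequence in both directions. A GRU stands in for
    the Mamba SSM block; the actual AuM model differs."""
    def __init__(self, patch=16, dim=192, n_classes=10):
        super().__init__()
        self.embed = nn.Conv2d(1, dim, kernel_size=patch, stride=patch)
        self.fwd = nn.GRU(dim, dim, batch_first=True)
        self.bwd = nn.GRU(dim, dim, batch_first=True)
        self.head = nn.Linear(2 * dim, n_classes)  # hypothetical class count

    def forward(self, spec):                        # spec: (B, 1, freq, time)
        x = self.embed(spec).flatten(2).transpose(1, 2)    # (B, seq, dim)
        h_f, _ = self.fwd(x)                               # left-to-right scan
        h_b, _ = self.bwd(x.flip(1))                       # right-to-left scan
        h = torch.cat([h_f, h_b.flip(1)], dim=-1).mean(1)  # pool over patches
        return self.head(h)

logits = BidirectionalAudioEncoder()(torch.randn(2, 1, 128, 1024))
```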
href="/search/eess?searchtype=author&query=Kim%2C+I">Il-Hwan Kim</a>, <a href="/search/eess?searchtype=author&query=Kim%2C+B">Byeong-Yeol Kim</a>, <a href="/search/eess?searchtype=author&query=Chung%2C+J+S">Joon Son Chung</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.10272v1-abstract-short" style="display: inline;"> The goal of this work is to simultaneously generate natural talking faces and speech outputs from text. We achieve this by integrating Talking Face Generation (TFG) and Text-to-Speech (TTS) systems into a unified framework. We address the main challenges of each task: (1) generating a range of head poses representative of real-world scenarios, and (2) ensuring voice consistency despite variations… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.10272v1-abstract-full').style.display = 'inline'; document.getElementById('2405.10272v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.10272v1-abstract-full" style="display: none;"> The goal of this work is to simultaneously generate natural talking faces and speech outputs from text. We achieve this by integrating Talking Face Generation (TFG) and Text-to-Speech (TTS) systems into a unified framework. We address the main challenges of each task: (1) generating a range of head poses representative of real-world scenarios, and (2) ensuring voice consistency despite variations in facial motion for the same identity. To tackle these issues, we introduce a motion sampler based on conditional flow matching, which is capable of high-quality motion code generation in an efficient way. Moreover, we introduce a novel conditioning method for the TTS system, which utilises motion-removed features from the TFG model to yield uniform speech outputs. Our extensive experiments demonstrate that our method effectively creates natural-looking talking faces and speech that accurately match the input text. To our knowledge, this is the first effort to build a multimodal synthesis system that can generalise to unseen identities. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.10272v1-abstract-full').style.display = 'none'; document.getElementById('2405.10272v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">CVPR 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2401.10032">arXiv:2401.10032</a> <span> [<a href="https://arxiv.org/pdf/2401.10032">pdf</a>, <a href="https://arxiv.org/format/2401.10032">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> FreGrad: Lightweight and Fast Frequency-aware Diffusion Vocoder </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Nguyen%2C+T+D">Tan Dat Nguyen</a>, <a href="/search/eess?searchtype=author&query=Kim%2C+J">Ji-Hoon Kim</a>, <a href="/search/eess?searchtype=author&query=Jang%2C+Y">Youngjoon Jang</a>, <a href="/search/eess?searchtype=author&query=Kim%2C+J">Jaehun Kim</a>, <a href="/search/eess?searchtype=author&query=Chung%2C+J+S">Joon Son Chung</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2401.10032v1-abstract-short" style="display: inline;"> The goal of this paper is to generate realistic audio with a lightweight and fast diffusion-based vocoder, named FreGrad. Our framework consists of the following three key components: (1) We employ discrete wavelet transform that decomposes a complicated waveform into sub-band wavelets, which helps FreGrad to operate on a simple and concise feature space, (2) We design a frequency-aware dilated co… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.10032v1-abstract-full').style.display = 'inline'; document.getElementById('2401.10032v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2401.10032v1-abstract-full" style="display: none;"> The goal of this paper is to generate realistic audio with a lightweight and fast diffusion-based vocoder, named FreGrad. Our framework consists of the following three key components: (1) We employ discrete wavelet transform that decomposes a complicated waveform into sub-band wavelets, which helps FreGrad to operate on a simple and concise feature space, (2) We design a frequency-aware dilated convolution that elevates frequency awareness, resulting in generating speech with accurate frequency information, and (3) We introduce a bag of tricks that boosts the generation quality of the proposed model. In our experiments, FreGrad achieves 3.7 times faster training time and 2.2 times faster inference speed compared to our baseline while reducing the model size by 0.6 times (only 1.78M parameters) without sacrificing the output quality. Audio samples are available at: https://mm.kaist.ac.kr/projects/FreGrad. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.10032v1-abstract-full').style.display = 'none'; document.getElementById('2401.10032v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 January, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to ICASSP 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2401.08415">arXiv:2401.08415</a> <span> [<a href="https://arxiv.org/pdf/2401.08415">pdf</a>, <a href="https://arxiv.org/format/2401.08415">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> From Coarse to Fine: Efficient Training for Audio Spectrogram Transformers </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Feng%2C+J">Jiu Feng</a>, <a href="/search/eess?searchtype=author&query=Erol%2C+M+H">Mehmet Hamza Erol</a>, <a href="/search/eess?searchtype=author&query=Chung%2C+J+S">Joon Son Chung</a>, <a href="/search/eess?searchtype=author&query=Senocak%2C+A">Arda Senocak</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2401.08415v1-abstract-short" style="display: inline;"> Transformers have become central to recent advances in audio classification. However, training an audio spectrogram transformer, e.g. AST, from scratch can be resource and time-intensive. Furthermore, the complexity of transformers heavily depends on the input audio spectrogram size. In this work, we aim to optimize AST training by linking to the resolution in the time-axis. We introduce multi-pha… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.08415v1-abstract-full').style.display = 'inline'; document.getElementById('2401.08415v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2401.08415v1-abstract-full" style="display: none;"> Transformers have become central to recent advances in audio classification. However, training an audio spectrogram transformer, e.g. AST, from scratch can be resource and time-intensive. Furthermore, the complexity of transformers heavily depends on the input audio spectrogram size. In this work, we aim to optimize AST training by linking to the resolution in the time-axis. We introduce multi-phase training of audio spectrogram transformers by connecting the seminal idea of coarse-to-fine with transformer models. To achieve this, we propose a set of methods for temporal compression. 
arXiv:2401.08415 [pdf, other] | cs.SD cs.LG eess.AS
From Coarse to Fine: Efficient Training for Audio Spectrogram Transformers
Authors: Jiu Feng, Mehmet Hamza Erol, Joon Son Chung, Arda Senocak
Abstract: Transformers have become central to recent advances in audio classification. However, training an audio spectrogram transformer, e.g. AST, from scratch can be resource- and time-intensive. Furthermore, the complexity of transformers heavily depends on the input audio spectrogram size. In this work, we aim to optimize AST training by linking it to the resolution in the time axis. We introduce multi-phase training of audio spectrogram transformers by connecting the seminal idea of coarse-to-fine with transformer models. To achieve this, we propose a set of methods for temporal compression. By employing one of these methods, the transformer model learns from lower-resolution (coarse) data in the initial phases, and is then fine-tuned with high-resolution data later in a curriculum learning strategy. Experimental results demonstrate that the proposed training mechanism for AST leads to improved (or on-par) performance with faster convergence, i.e. requiring fewer computational resources and less time. This approach is also generalizable to other AST-based methods regardless of their learning paradigms.
Submitted 16 January, 2024; originally announced January 2024.
Comments: ICASSP 2024
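The curriculum in the entry above hinges on compressing the spectrogram's time axis in early phases. A minimal sketch of one plausible compression method (average pooling) and the phase schedule; the paper proposes several methods, and this pooling variant plus the factors shown are assumptions.

```python
import torch
import torch.nn.functional as F

def compress_time(spec, factor):
    """One possible temporal-compression method: average-pool along time.
    spec: (B, freq, time). Illustrative only; the paper's methods may differ."""
    return F.avg_pool1d(spec, kernel_size=factor, stride=factor)

spec = torch.randn(4, 128, 1024)
# Curriculum: early phases see coarse (short) inputs, the last phase sees
# the full-resolution spectrogram.
for phase, factor in enumerate([4, 2, 1], start=1):
    coarse = compress_time(spec, factor) if factor > 1 else spec
    print(f"phase {phase}: time axis = {coarse.shape[-1]} frames")
    # ... train / fine-tune the AST on `coarse` here ...
```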
arXiv:2311.04066 [pdf, other] | cs.CV cs.AI cs.MM cs.SD eess.AS
Can CLIP Help Sound Source Localization?
Authors: Sooyoung Park, Arda Senocak, Joon Son Chung
Abstract: Large-scale pre-trained image-text models demonstrate remarkable versatility across diverse tasks, benefiting from their robust representational capabilities and effective multimodal alignment. We extend the application of these models, specifically CLIP, to the domain of sound source localization. Unlike conventional approaches, we employ the pre-trained CLIP model without explicit text input, relying solely on the audio-visual correspondence. To this end, we introduce a framework that translates audio signals into tokens compatible with CLIP's text encoder, yielding audio-driven embeddings. By directly using these embeddings, our method generates audio-grounded masks for the provided audio, extracts audio-grounded image features from the highlighted regions, and aligns them with the audio-driven embeddings using the audio-visual correspondence objective. Our findings suggest that utilizing pre-trained image-text models enables our model to generate more complete and compact localization maps for the sounding objects. Extensive experiments show that our method outperforms state-of-the-art approaches by a significant margin.
Submitted 7 November, 2023; originally announced November 2023.
Comments: WACV 2024
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.19581v1-abstract-full').style.display = 'none'; document.getElementById('2310.19581v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project page with demo: https://mm.kaist.ac.kr/projects/avdiffuss/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2309.14741">arXiv:2309.14741</a> <span> [<a href="https://arxiv.org/pdf/2309.14741">pdf</a>, <a href="https://arxiv.org/format/2309.14741">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Rethinking Session Variability: Leveraging Session Embeddings for Session Robustness in Speaker Verification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Heo%2C+H">Hee-Soo Heo</a>, <a href="/search/eess?searchtype=author&query=Nam%2C+K">KiHyun Nam</a>, <a href="/search/eess?searchtype=author&query=Lee%2C+B">Bong-Jin Lee</a>, <a href="/search/eess?searchtype=author&query=Kwon%2C+Y">Youngki Kwon</a>, <a href="/search/eess?searchtype=author&query=Lee%2C+M">Minjae Lee</a>, <a href="/search/eess?searchtype=author&query=Kim%2C+Y+J">You Jin Kim</a>, <a href="/search/eess?searchtype=author&query=Chung%2C+J+S">Joon Son Chung</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2309.14741v1-abstract-short" style="display: inline;"> In the field of speaker verification, session or channel variability poses a significant challenge. While many contemporary methods aim to disentangle session information from speaker embeddings, we introduce a novel approach using an additional embedding to represent the session information. This is achieved by training an auxiliary network appended to the speaker embedding extractor which remain… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.14741v1-abstract-full').style.display = 'inline'; document.getElementById('2309.14741v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2309.14741v1-abstract-full" style="display: none;"> In the field of speaker verification, session or channel variability poses a significant challenge. While many contemporary methods aim to disentangle session information from speaker embeddings, we introduce a novel approach using an additional embedding to represent the session information. This is achieved by training an auxiliary network appended to the speaker embedding extractor which remains fixed in this training process. This results in two similarity scores: one for the speakers information and one for the session information. 
arXiv:2309.14741 [pdf, other] | eess.AS cs.SD
Rethinking Session Variability: Leveraging Session Embeddings for Session Robustness in Speaker Verification
Authors: Hee-Soo Heo, KiHyun Nam, Bong-Jin Lee, Youngki Kwon, Minjae Lee, You Jin Kim, Joon Son Chung
Abstract: In the field of speaker verification, session or channel variability poses a significant challenge. While many contemporary methods aim to disentangle session information from speaker embeddings, we introduce a novel approach using an additional embedding to represent the session information. This is achieved by training an auxiliary network appended to the speaker embedding extractor, which remains fixed in this training process. This results in two similarity scores: one for the speaker information and one for the session information. The latter score acts as a compensator for the former, which might be skewed due to session variations. Our extensive experiments demonstrate that session information can be effectively compensated for without retraining the embedding extractor.
Submitted 26 September, 2023; originally announced September 2023.
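To make the two-score idea concrete, here is an illustrative scoring function in which the session-similarity score discounts the speaker-similarity score when two recordings share channel conditions. The linear weighting and the alpha value are assumptions, not the paper's formula.

```python
import numpy as np

def cosine(a, b):
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

def compensated_score(spk1, spk2, sess1, sess2, alpha=0.5):
    """Sketch: subtract a weighted session score from the speaker score, so a
    high speaker similarity caused by a shared session is discounted."""
    return cosine(spk1, spk2) - alpha * cosine(sess1, sess2)

rng = np.random.default_rng(0)
spk_a, spk_b = rng.normal(size=192), rng.normal(size=192)    # speaker embeddings
sess_a, sess_b = rng.normal(size=64), rng.normal(size=64)    # session embeddings
print(compensated_score(spk_a, spk_b, sess_a, sess_b))
```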
arXiv:2309.13664 [pdf, other] | eess.AS cs.AI cs.CL cs.LG cs.SD
VoiceLDM: Text-to-Speech with Environmental Context
Authors: Yeonghyeon Lee, Inmo Yeon, Juhan Nam, Joon Son Chung
Abstract: This paper presents VoiceLDM, a model designed to produce audio that accurately follows two distinct natural language text prompts: the description prompt and the content prompt. The former provides information about the overall environmental context of the audio, while the latter conveys the linguistic content. To achieve this, we adopt a text-to-audio (TTA) model based on latent diffusion models and extend its functionality to incorporate an additional content prompt as a conditional input. By utilizing pretrained contrastive language-audio pretraining (CLAP) and Whisper, VoiceLDM is trained on large amounts of real-world audio without manual annotations or transcriptions. Additionally, we employ dual classifier-free guidance to further enhance the controllability of VoiceLDM. Experimental results demonstrate that VoiceLDM is capable of generating plausible audio that aligns well with both input conditions, even surpassing the speech intelligibility of the ground truth audio on the AudioCaps test set. Furthermore, we explore the text-to-speech (TTS) and zero-shot text-to-audio capabilities of VoiceLDM and show that it achieves competitive results. Demos and code are available at https://voiceldm.github.io.
Submitted 24 September, 2023; originally announced September 2023.
Comments: Demos and code are available at https://voiceldm.github.io
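The abstract mentions dual classifier-free guidance over two prompts. One common way to combine two guidance signals is to add a scaled correction per condition to the unconditional prediction; whether VoiceLDM uses exactly this combination is an assumption on my part.

```python
import torch

def dual_cfg(eps_uncond, eps_desc, eps_cont, w_desc=3.0, w_cont=3.0):
    """Sketch of dual classifier-free guidance: start from the unconditional
    denoiser output and add one scaled correction per prompt. The combination
    rule and weights are assumptions, not VoiceLDM's published formula."""
    return (eps_uncond
            + w_desc * (eps_desc - eps_uncond)    # push toward description prompt
            + w_cont * (eps_cont - eps_uncond))   # push toward content prompt

# Toy latents standing in for three denoiser passes on the same noisy input.
noise = torch.randn(1, 8, 64, 64)
guided = dual_cfg(noise, noise * 1.1, noise * 0.9)
```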
arXiv:2309.12306 [pdf, other] | cs.CV cs.SD eess.AS
TalkNCE: Improving Active Speaker Detection with Talk-Aware Contrastive Learning
Authors: Chaeyoung Jung, Suyeon Lee, Kihyun Nam, Kyeongha Rho, You Jin Kim, Youngjoon Jang, Joon Son Chung
Abstract: The goal of this work is Active Speaker Detection (ASD), a task to determine whether a person is speaking or not in a series of video frames. Previous works have dealt with the task by exploring network architectures, while learning effective representations has been less explored. In this work, we propose TalkNCE, a novel talk-aware contrastive loss. The loss is applied only to the parts of the full segments where a person on the screen is actually speaking. This encourages the model to learn effective representations through the natural correspondence of speech and facial movements. Our loss can be jointly optimized with the existing objectives for training ASD models without the need for additional supervision or training data. The experiments demonstrate that our loss can be easily integrated into the existing ASD frameworks, improving their performance. Our method achieves state-of-the-art performances on the AVA-ActiveSpeaker and ASW datasets.
Submitted 21 September, 2023; originally announced September 2023.
arXiv:2309.10724 [pdf, other] | cs.CV cs.AI cs.MM cs.SD eess.AS
Sound Source Localization is All about Cross-Modal Alignment
Authors: Arda Senocak, Hyeonggon Ryu, Junsik Kim, Tae-Hyun Oh, Hanspeter Pfister, Joon Son Chung
Abstract: Humans can easily perceive the direction of sound sources in a visual scene, termed sound source localization. Recent studies on learning-based sound source localization have mainly explored the problem from a localization perspective. However, prior arts and existing benchmarks do not account for a more important aspect of the problem, cross-modal semantic understanding, which is essential for genuine sound source localization. Cross-modal semantic understanding is important in understanding semantically mismatched audio-visual events, e.g., silent objects or off-screen sounds. To account for this, we propose a cross-modal alignment task as a joint task with sound source localization to better learn the interaction between audio and visual modalities. Thereby, we achieve high localization performance with strong cross-modal semantic understanding. Our method outperforms the state-of-the-art approaches in both sound source localization and cross-modal retrieval. Our work suggests that jointly tackling both tasks is necessary to conquer genuine sound source localization.
Submitted 19 September, 2023; originally announced September 2023.
Comments: ICCV 2023
arXiv:2308.15256 [pdf, other] | eess.AS cs.AI cs.LG cs.SD
Let There Be Sound: Reconstructing High Quality Speech from Silent Videos
Authors: Ji-Hoon Kim, Jaehun Kim, Joon Son Chung
Abstract: The goal of this work is to reconstruct high quality speech from lip motions alone, a task also known as lip-to-speech. A key challenge of lip-to-speech systems is the one-to-many mapping caused by (1) the existence of homophenes and (2) multiple speech variations, resulting in mispronounced and over-smoothed speech. In this paper, we propose a novel lip-to-speech system that significantly improves the generation quality by alleviating the one-to-many mapping problem from multiple perspectives. Specifically, we incorporate (1) self-supervised speech representations to disambiguate homophenes, and (2) acoustic variance information to model diverse speech styles. Additionally, to better solve the aforementioned problem, we employ a flow-based post-net which captures and refines the details of the generated speech. We perform extensive experiments on two datasets, and demonstrate that our method achieves generation quality close to that of real human utterances, outperforming existing methods in terms of speech naturalness and intelligibility by a large margin. Synthesised samples are available at our demo page: https://mm.kaist.ac.kr/projects/LTBS.
Submitted 4 January, 2024; v1 submitted 29 August, 2023; originally announced August 2023.
Comments: Accepted to AAAI 2024
arXiv:2307.09286 [pdf, other] | cs.SD cs.LG eess.AS
FlexiAST: Flexibility is What AST Needs
Authors: Jiu Feng, Mehmet Hamza Erol, Joon Son Chung, Arda Senocak
Abstract: The objective of this work is to give patch-size flexibility to Audio Spectrogram Transformers (AST). Recent advancements in ASTs have shown superior performance in various audio-based tasks. However, the performance of standard ASTs degrades drastically when evaluated using patch sizes different from the one used during training. As a result, AST models are typically re-trained to accommodate changes in patch sizes. To overcome this limitation, this paper proposes FlexiAST, a training procedure that provides flexibility to standard AST models without architectural changes, allowing them to work with various patch sizes at the inference stage. The proposed training approach simply utilizes random patch-size selection and resizing of patch and positional embedding weights. Our experiments show that FlexiAST gives similar performance to standard AST models while maintaining its ability to be evaluated at various patch sizes on different datasets for audio classification tasks.
Submitted 18 July, 2023; originally announced July 2023.
Comments: Interspeech 2023
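One concrete ingredient named above is resizing positional embedding weights when the patch size (and hence the patch grid) changes. A minimal sketch via bilinear interpolation; the grid sizes are hypothetical, and FlexiAST additionally resizes the patch-embedding weights themselves.

```python
import torch
import torch.nn.functional as F

def resize_pos_embed(pos, old_grid, new_grid):
    """Resize a learned positional embedding to a new patch grid by bilinear
    interpolation, one ingredient of patch-size flexibility.

    pos: (1, old_h*old_w, dim)."""
    dim = pos.shape[-1]
    grid = pos.reshape(1, *old_grid, dim).permute(0, 3, 1, 2)   # (1, dim, h, w)
    grid = F.interpolate(grid, size=new_grid, mode='bilinear',
                         align_corners=False)
    return grid.permute(0, 2, 3, 1).reshape(1, new_grid[0] * new_grid[1], dim)

pos = torch.randn(1, 8 * 64, 192)                 # e.g. trained with 16x16 patches
new = resize_pos_embed(pos, (8, 64), (16, 128))   # evaluate with 8x8 patches
print(new.shape)                                  # torch.Size([1, 2048, 192])
```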
arXiv:2303.17517 [pdf, other] | cs.CL cs.CV cs.SD eess.AS
Hindi as a Second Language: Improving Visually Grounded Speech with Semantically Similar Samples
Authors: Hyeonggon Ryu, Arda Senocak, In So Kweon, Joon Son Chung
Abstract: The objective of this work is to explore the learning of visually grounded speech models (VGS) from a multilingual perspective. Bilingual VGS models are generally trained with an equal number of spoken captions from both languages. However, in reality, there can be an imbalance among the languages in the available spoken captions. Our key contribution in this work is to leverage the power of a high-resource language in a bilingual visually grounded speech model to improve the performance of a low-resource language. We introduce two methods to distill the knowledge of the high-resource language into low-resource languages: (1) incorporating a strong pre-trained high-resource language encoder and (2) using semantically similar spoken captions. Our experiments show that combining these two approaches effectively enables the low-resource language to surpass the performances of monolingual and bilingual counterparts for cross-modal retrieval tasks.
Submitted 30 March, 2023; originally announced March 2023.
Comments: ICASSP 2023
arXiv:2302.13700 [pdf, other] | cs.LG cs.CV cs.SD eess.AS
Imaginary Voice: Face-styled Diffusion Model for Text-to-Speech
Authors: Jiyoung Lee, Joon Son Chung, Soo-Whan Chung
Abstract: The goal of this work is zero-shot text-to-speech synthesis, with speaking styles and voices learnt from facial characteristics. Inspired by the natural fact that people can imagine the voice of someone when they look at his or her face, we introduce a face-styled diffusion text-to-speech (TTS) model within a unified framework learnt from visible attributes, called Face-TTS. This is the first time that face images are used as a condition to train a TTS model. We jointly train cross-model biometrics and TTS models to preserve speaker identity between face images and generated speech segments. We also propose a speaker feature binding loss to enforce the similarity of the generated and the ground truth speech segments in speaker embedding space. Since the biometric information is extracted directly from the face image, our method does not require extra fine-tuning steps to generate speech from unseen and unheard speakers. We train and evaluate the model on the LRS3 dataset, an in-the-wild audio-visual corpus containing background noise and diverse speaking styles. The project page is https://facetts.github.io.
Submitted 27 February, 2023; originally announced February 2023.
Comments: ICASSP 2023. Project page: https://facetts.github.io
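The speaker feature binding loss above pulls generated speech toward the ground truth in speaker-embedding space. A sketch under stated assumptions: the cosine form and the stand-in encoder are mine, not the paper's definition.

```python
import torch
import torch.nn.functional as F

def speaker_binding_loss(spk_encoder, gen_speech, gt_speech):
    """Sketch of a speaker feature binding loss: make the generated segment
    similar to the ground-truth segment in speaker-embedding space. The cosine
    form is an assumption; Face-TTS defines its own loss."""
    e_gen = F.normalize(spk_encoder(gen_speech), dim=-1)
    e_gt = F.normalize(spk_encoder(gt_speech), dim=-1)
    return 1 - (e_gen * e_gt).sum(-1).mean()    # 0 when embeddings align

# Hypothetical stand-in speaker encoder, purely for illustration.
spk_encoder = torch.nn.Sequential(torch.nn.Linear(16000, 192))
loss = speaker_binding_loss(spk_encoder,
                            torch.randn(4, 16000), torch.randn(4, 16000))
```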
arXiv:2211.01966 (https://arxiv.org/abs/2211.01966) [cs.CV, cs.MM, cs.SD, eess.AS, eess.IV]
MarginNCE: Robust Sound Localization with a Negative Margin
Authors: Sooyoung Park, Arda Senocak, Joon Son Chung
Abstract: The goal of this work is to localize sound sources in visual scenes with a self-supervised approach. Contrastive learning in the context of sound source localization leverages the natural correspondence between audio and visual signals: audio-visual pairs from the same source are assumed to be positive, while randomly selected pairs are negatives. However, this approach brings in noisy correspondences; for example, positive pairs whose audio and visual signals may be unrelated to each other, or negative pairs that may contain samples semantically similar to the positive one. Our key contribution in this work is to show that using a less strict decision boundary in contrastive learning can alleviate the effect of noisy correspondences in sound source localization. We propose a simple yet effective approach that slightly modifies the contrastive loss with a negative margin. Extensive experimental results show that our approach gives on-par or better performance than the state-of-the-art methods. Furthermore, we demonstrate that introducing a negative margin to existing methods results in a consistent improvement in performance.
Submitted 3 November, 2022; originally announced November 2022.
Comments: Submitted to ICASSP 2023. SOTA performance in Audio-Visual Sound Localization. 5 pages.
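The negative margin amounts to a one-line change to a standard InfoNCE objective: the positive pair's similarity is relaxed rather than tightened. A minimal sketch, assuming in-batch negatives and cosine similarity; the margin and temperature values are illustrative, not the paper's settings:

    import torch
    import torch.nn.functional as F

    def margin_nce_loss(audio_emb, visual_emb, margin=-0.1, temperature=0.07):
        """InfoNCE over a batch, with the positive (diagonal) similarity
        shifted by a margin; margin < 0 loosens the decision boundary."""
        a = F.normalize(audio_emb, dim=-1)     # (B, D)
        v = F.normalize(visual_emb, dim=-1)    # (B, D)
        sim = a @ v.t()                        # (B, B) cosine similarities
        sim = sim - margin * torch.eye(sim.size(0), device=sim.device)
        targets = torch.arange(sim.size(0), device=sim.device)
        return F.cross_entropy(sim / temperature, targets)

With margin < 0 the positive logit is raised, so borderline (possibly noisy) pairs are penalised less harshly than under the strict InfoNCE boundary.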
arXiv:2211.00439 (https://arxiv.org/abs/2211.00439) [eess.AS, cs.SD]
Metric Learning for User-defined Keyword Spotting
Authors: Jaemin Jung, Youkyum Kim, Jihwan Park, Youshin Lim, Byeong-Yeol Kim, Youngjoon Jang, Joon Son Chung
Abstract: The goal of this work is to detect new spoken terms defined by users. While most previous works address keyword spotting (KWS) as a closed-set classification problem, this limits their transferability to unseen terms. The ability to define custom keywords has advantages in terms of user experience. In this paper, we propose a metric learning-based training strategy for user-defined keyword spotting. In particular, we make the following contributions: (1) we construct a large-scale keyword dataset from an existing speech corpus and propose a filtering method to remove data that degrade model training; (2) we propose a metric learning-based two-stage training strategy, and demonstrate that the proposed method improves performance on the user-defined keyword spotting task by enriching keyword representations; (3) to facilitate fair comparison in the user-defined KWS field, we propose a unified evaluation protocol and metrics. Our proposed system does not require incremental training on the user-defined keywords, and outperforms previous works by a significant margin on the Google Speech Commands dataset using the proposed as well as the existing metrics.
Submitted 1 November, 2022; originally announced November 2022.
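One reason a metric-learning formulation suits user-defined KWS is that a new keyword can be enrolled from a handful of examples without retraining. A sketch of that enrolment-and-matching step, assuming a trained keyword encoder kw_enc (hypothetical) that returns one embedding per utterance; the threshold is arbitrary:

    import torch
    import torch.nn.functional as F

    def enroll(kw_enc, enrolment_utts):
        """Average a few embeddings of the user's keyword into a prototype."""
        embs = F.normalize(kw_enc(enrolment_utts), dim=-1)   # (n_shots, D)
        return F.normalize(embs.mean(dim=0), dim=0)          # (D,)

    def detect(kw_enc, prototype, test_utt, threshold=0.7):
        """Flag the keyword when the test embedding is close to the prototype."""
        emb = F.normalize(kw_enc(test_utt.unsqueeze(0)), dim=-1)[0]
        return torch.dot(emb, prototype).item() >= threshold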
arXiv:2211.00437 (https://arxiv.org/abs/2211.00437) [eess.AS, cs.SD]
Disentangled representation learning for multilingual speaker recognition
Authors: Kihyun Nam, Youkyum Kim, Jaesung Huh, Hee Soo Heo, Jee-weon Jung, Joon Son Chung
Abstract: The goal of this paper is to learn robust speaker representations for the bilingual speaking scenario. The majority of the world's population speak at least two languages; however, most speaker recognition systems fail to recognise the same speaker when speaking in different languages. Popular speaker recognition evaluation sets do not consider the bilingual scenario, making it difficult to analyse the effect of bilingual speakers on speaker recognition performance. In this paper, we publish a large-scale evaluation set named VoxCeleb1-B, derived from VoxCeleb, that considers bilingual scenarios. We introduce an effective disentanglement learning strategy that combines adversarial and metric learning-based methods. This approach addresses the bilingual situation by disentangling language-related information from the speaker representation while ensuring stable speaker representation learning. Our language-disentangled learning method uses only language pseudo-labels, without manual annotation.
Submitted 6 June, 2023; v1 submitted 1 November, 2022; originally announced November 2022.
Comments: Interspeech 2023
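Adversarial disentanglement of this kind is often implemented with a gradient reversal layer: a language classifier is trained on top of the speaker embedding, while the reversed gradient pushes the encoder to discard language cues. A generic sketch of that mechanism, not the paper's exact architecture:

    import torch
    import torch.nn.functional as F

    class GradReverse(torch.autograd.Function):
        """Identity on the forward pass; negates (and scales) gradients on the way back."""
        @staticmethod
        def forward(ctx, x, lambd=1.0):
            ctx.lambd = lambd
            return x.view_as(x)

        @staticmethod
        def backward(ctx, grad_output):
            return -ctx.lambd * grad_output, None

    def adversarial_language_loss(language_head, speaker_emb, language_pseudo_labels):
        reversed_emb = GradReverse.apply(speaker_emb)   # flips gradients only
        logits = language_head(reversed_emb)            # tries to predict language
        return F.cross_entropy(logits, language_pseudo_labels)

Minimising this loss trains the language head normally, while the reversed gradient trains the speaker encoder to make language prediction harder.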
arXiv:2210.14682 (https://arxiv.org/abs/2210.14682) [cs.SD, cs.AI, eess.AS]
In search of strong embedding extractors for speaker diarisation
Authors: Jee-weon Jung, Hee-Soo Heo, Bong-Jin Lee, Jaesung Huh, Andrew Brown, Youngki Kwon, Shinji Watanabe, Joon Son Chung
Abstract: Speaker embedding extractors (EEs), which map input audio to a speaker-discriminant latent space, are of paramount importance in speaker diarisation. However, there are several challenges when adopting EEs for diarisation, of which we tackle two key problems. First, evaluation is not straightforward, because the features required for good performance differ between speaker verification and diarisation. We show that better performance on widely adopted speaker verification evaluation protocols does not lead to better diarisation performance. Second, embedding extractors have not seen utterances in which multiple speakers exist. Such inputs are inevitably present in speaker diarisation because of overlapped speech and speaker changes, and they degrade performance. To mitigate the first problem, we generate speaker verification evaluation protocols that better mimic the diarisation scenario. To alleviate the second problem, we propose two data augmentation techniques that make embedding extractors aware of overlapped speech or speaker-change input. One technique generates overlapped speech segments, and the other generates segments where two speakers utter sequentially. Extensive experimental results using three state-of-the-art speaker embedding extractors demonstrate that both proposed approaches are effective.
Submitted 26 October, 2022; originally announced October 2022.
Comments: 5 pages, 1 figure, 2 tables, submitted to ICASSP
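Both augmentations reduce to simple waveform operations. A minimal sketch, assuming mono waveforms as NumPy arrays; the gain computation is an illustrative choice:

    import numpy as np

    def make_overlap(wav_a, wav_b, snr_db=0.0):
        """Mix two speakers' utterances to simulate overlapped speech,
        scaling the interfering speaker to the requested SNR."""
        gain = 10 ** (-snr_db / 20) * (np.std(wav_a) / (np.std(wav_b) + 1e-8))
        n = min(len(wav_a), len(wav_b))
        return wav_a[:n] + gain * wav_b[:n]

    def make_speaker_change(wav_a, wav_b):
        """Concatenate two speakers' utterances to simulate a speaker change."""
        return np.concatenate([wav_a, wav_b])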
arXiv:2210.10985 (https://arxiv.org/abs/2210.10985) [cs.SD, cs.AI, eess.AS]
Large-scale learning of generalised representations for speaker recognition
Authors: Jee-weon Jung, Hee-Soo Heo, Bong-Jin Lee, Jaesong Lee, Hye-jin Shim, Youngki Kwon, Joon Son Chung, Shinji Watanabe
Abstract: The objective of this work is to develop a speaker recognition model for use in diverse scenarios. We hypothesise that two components must be adequately configured to build such a model. First, an adequate architecture is required. We explore several recent state-of-the-art models, including ECAPA-TDNN and MFA-Conformer, as well as other baselines. Second, a massive amount of data is required. We investigate several new training data configurations that combine a few existing datasets. The most extensive configuration includes over 87k speakers and 10.22k hours of speech. Four evaluation protocols are adopted to measure how the trained model performs in diverse scenarios. Through experiments, we find that MFA-Conformer, which has the least inductive bias, generalises the best. We also show that training with the proposed large data configurations gives better performance. A boost in generalisation is observed, where the average performance on the four evaluation protocols improves by more than 20%. In addition, we demonstrate that the models' performance can improve even further when capacity is increased.
Submitted 27 October, 2022; v1 submitted 19 October, 2022; originally announced October 2022.
Comments: 5 pages, 5 tables, submitted to ICASSP
arXiv:2204.09976 (https://arxiv.org/abs/2204.09976) [cs.SD, eess.AS]
Baseline Systems for the First Spoofing-Aware Speaker Verification Challenge: Score and Embedding Fusion
Authors: Hye-jin Shim, Hemlata Tak, Xuechen Liu, Hee-Soo Heo, Jee-weon Jung, Joon Son Chung, Soo-Whan Chung, Ha-Jin Yu, Bong-Jin Lee, Massimiliano Todisco, Héctor Delgado, Kong Aik Lee, Md Sahidullah, Tomi Kinnunen, Nicholas Evans
Abstract: Deep learning has brought impressive progress in the study of both automatic speaker verification (ASV) and spoofing countermeasures (CM). Although the solutions are mutually dependent, they have typically evolved as standalone sub-systems, whereby CM solutions are usually designed for a fixed ASV system. The work reported in this paper aims to gauge the improvements in reliability that can be gained from their closer integration. Results derived using the popular ASVspoof2019 dataset indicate that the equal error rate (EER) of a state-of-the-art ASV system degrades from 1.63% to 23.83% when the evaluation protocol is extended with spoofed trials. However, even the straightforward integration of ASV and CM systems in the form of score-sum and deep neural network-based fusion strategies reduces the EER to 1.71% and 6.37%, respectively. The new Spoofing-Aware Speaker Verification (SASV) challenge has been formed to encourage greater attention to the integration of ASV and CM systems and to provide a means to benchmark different solutions.
Submitted 21 April, 2022; originally announced April 2022.
Comments: 8 pages, accepted by Odyssey 2022
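The score-sum baseline mentioned above is the simplest fusion strategy: a trial is accepted only when the combined speaker-similarity and bona-fide evidence is high enough. A sketch, with the caveat that score scaling and the threshold are assumptions rather than the challenge recipe:

    def sasv_score(asv_score, cm_score):
        """Score-sum fusion: combined evidence that the trial is both the
        target speaker and bona fide. Assumes comparably scaled scores."""
        return asv_score + cm_score

    # usage: accept the trial when sasv_score(s_asv, s_cm) >= threshold

A spoofed trial may fool the ASV score alone, but a low bona-fide score from the countermeasure pulls the sum below the threshold, which is why even this trivial fusion recovers most of the degraded EER.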
arXiv:2203.14525 (https://arxiv.org/abs/2203.14525) [eess.AS]
Curriculum learning for self-supervised speaker verification
Authors: Hee-Soo Heo, Jee-weon Jung, Jingu Kang, Youngki Kwon, You Jin Kim, Bong-Jin Lee, Joon Son Chung
Abstract: The goal of this paper is to train effective self-supervised speaker representations without identity labels. We propose two curriculum learning strategies within a self-supervised learning framework. The first strategy gradually increases the number of speakers in the training phase by enlarging the used portion of the training dataset. The second strategy applies various data augmentations to more utterances within a mini-batch as training proceeds. A range of experiments conducted using the DINO self-supervised framework on the VoxCeleb1 evaluation protocol demonstrates the effectiveness of our proposed curriculum learning strategies. We report a competitive equal error rate of 4.47% with single-phase training, and we further demonstrate that the performance improves to 1.84% by fine-tuning on a small labelled dataset.
Submitted 13 February, 2024; v1 submitted 28 March, 2022; originally announced March 2022.
Comments: INTERSPEECH 2023. 5 pages, 3 figures, 4 tables
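Both curricula can be written as schedules over training progress. A generic sketch with linear pacing, which is an assumption; the paper's exact schedules may differ:

    def dataset_portion(epoch, total_epochs, start=0.2):
        """Fraction of the training set (and hence of speakers) exposed so far."""
        return min(1.0, start + (1.0 - start) * epoch / total_epochs)

    def augmented_fraction(epoch, total_epochs):
        """Fraction of utterances in a mini-batch that receive augmentation."""
        return min(1.0, epoch / total_epochs)

The common thread is easy-to-hard ordering: fewer speakers and cleaner utterances early on, with the full dataset and heavier augmentation only once the representation has stabilised.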
arXiv:2203.08488 (https://arxiv.org/abs/2203.08488) [eess.AS, cs.AI]
Pushing the limits of raw waveform speaker recognition
Authors: Jee-weon Jung, You Jin Kim, Hee-Soo Heo, Bong-Jin Lee, Youngki Kwon, Joon Son Chung
Abstract: In recent years, speaker recognition systems based on raw waveform inputs have received increasing attention. However, the performance of such systems is typically inferior to that of the state-of-the-art handcrafted feature-based counterparts, which demonstrate equal error rates under 1% on the popular VoxCeleb1 test set. This paper proposes a novel speaker recognition model based on raw waveform inputs. The model incorporates recent advances in machine learning and speaker verification, including the Res2Net backbone module and multi-layer feature aggregation. Our best model achieves an equal error rate of 0.89%, which is competitive with the state-of-the-art models based on handcrafted features, and outperforms the best model based on raw waveform inputs by a large margin. We also explore the application of the proposed model within a self-supervised learning framework, where it outperforms existing single-phase works in this line of research. Finally, we show that self-supervised pre-training is effective in the semi-supervised scenario where only a small set of labelled training data is available, along with a larger set of unlabelled examples.
Submitted 28 March, 2022; v1 submitted 16 March, 2022; originally announced March 2022.
Comments: submitted to INTERSPEECH 2022 as a conference paper. 5 pages, 2 figures, 5 tables
arXiv:2201.04583 (https://arxiv.org/abs/2201.04583) [cs.SD, eess.AS]
VoxSRC 2021: The Third VoxCeleb Speaker Recognition Challenge
Authors: Andrew Brown, Jaesung Huh, Joon Son Chung, Arsha Nagrani, Daniel Garcia-Romero, Andrew Zisserman
Abstract: The third instalment of the VoxCeleb Speaker Recognition Challenge was held in conjunction with Interspeech 2021. The aim of this challenge was to assess how well current speaker recognition technology is able to diarise and recognise speakers in unconstrained or 'in the wild' data. The challenge consisted of: (i) the provision of publicly available speaker recognition and diarisation data from YouTube videos, together with ground truth annotation and standardised evaluation software; and (ii) a virtual public challenge and workshop held at Interspeech 2021. This paper outlines the challenge, and describes the baselines, methods and results. We conclude with a discussion on the new multi-lingual focus of VoxSRC 2021, and on the progression of the challenge since the previous two editions.
Submitted 16 November, 2022; v1 submitted 12 January, 2022; originally announced January 2022.
Comments: arXiv admin note: substantial text overlap with arXiv:2012.06867
arXiv:2110.03361 (https://arxiv.org/abs/2110.03361) [eess.AS, cs.AI]
Multi-scale speaker embedding-based graph attention networks for speaker diarisation
Authors: Youngki Kwon, Hee-Soo Heo, Jee-weon Jung, You Jin Kim, Bong-Jin Lee, Joon Son Chung
Abstract: The objective of this work is effective speaker diarisation using multi-scale speaker embeddings. Typically, there is a trade-off between the ability to recognise short speaker segments and the discriminative power of the embedding, according to the segment length used for embedding extraction. To this end, recent works have proposed the use of multi-scale embeddings, where segments of varying lengths are used. However, the scores are combined using a weighted summation scheme in which the weights are fixed after the training phase, whereas the importance of each segment length can differ within a single session. To address this issue, we present three key contributions in this paper: (1) we propose graph attention networks for multi-scale speaker diarisation; (2) we design scale indicators to utilise the scale information of each embedding; (3) we adapt the attention-based aggregation to utilise a pre-computed affinity matrix from multi-scale embeddings. We demonstrate the effectiveness of our method on various datasets, where speaker confusion, the primary metric, drops by over 10% on average relative to the baseline.
Submitted 7 October, 2021; originally announced October 2021.
Comments: 5 pages, 2 figures, submitted to ICASSP as a conference paper
arXiv:2110.02791 (https://arxiv.org/abs/2110.02791) [cs.SD, cs.CL, eess.AS]
Spell my name: keyword boosted speech recognition
Authors: Namkyu Jung, Geonmin Kim, Joon Son Chung
Abstract: Recognition of uncommon words such as names and technical terminology is important to understanding conversations in context. However, the ability to recognise such words remains a challenge in modern automatic speech recognition (ASR) systems. In this paper, we propose a simple but powerful ASR decoding method that can better recognise these uncommon keywords, which in turn enables better readability of the results. The method boosts the probabilities of given keywords in a beam search based on acoustic model predictions, and does not require any training in advance. We demonstrate the effectiveness of our method on the LibriSpeech test sets and also on internal data of real-world conversations. Our method significantly boosts keyword accuracy on the test sets, while maintaining the accuracy of other words, as well as providing significant qualitative improvements. This method is applicable to other tasks such as machine translation, or wherever unseen and difficult keywords need to be recognised in beam search.
Submitted 6 October, 2021; originally announced October 2021.
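The decoding trick is to reward hypotheses that contain the supplied keywords during beam search. A toy word-level sketch; real systems apply the bonus to subword prefixes inside the beam, and the bonus value here is arbitrary:

    def boosted_score(base_log_prob, words, keywords, bonus=2.0):
        """Add a fixed log-probability bonus per matched keyword."""
        return base_log_prob + bonus * sum(w in keywords for w in words)

    # usage: rank beam hypotheses by the boosted score, not the raw log-probability
    keywords = {"voxceleb", "diarisation"}
    hyps = [(-12.1, ["the", "vox", "celeb", "set"]),
            (-12.6, ["the", "voxceleb", "set"])]
    best = max(hyps, key=lambda h: boosted_score(h[0], h[1], keywords))

Because the bonus only shifts scores of hypotheses that already match a keyword, hypotheses without keywords keep their relative ranking, which is consistent with the reported preservation of accuracy on other words.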
arXiv:2110.01200 (https://arxiv.org/abs/2110.01200) [eess.AS, cs.AI, cs.LG]
AASIST: Audio Anti-Spoofing using Integrated Spectro-Temporal Graph Attention Networks
Authors: Jee-weon Jung, Hee-Soo Heo, Hemlata Tak, Hye-jin Shim, Joon Son Chung, Bong-Jin Lee, Ha-Jin Yu, Nicholas Evans
Abstract: Artefacts that differentiate spoofed from bona-fide utterances can reside in the spectral or temporal domains. Their reliable detection usually depends upon computationally demanding ensemble systems in which each subsystem is tuned to some specific artefacts. We seek to develop an efficient, single system that can detect a broad range of different spoofing attacks without score-level ensembles. We propose a novel heterogeneous stacking graph attention layer that models artefacts spanning heterogeneous temporal and spectral domains with a heterogeneous attention mechanism and a stack node. With a new max graph operation that involves a competitive mechanism and an extended readout scheme, our approach, named AASIST, outperforms the current state-of-the-art by 20% relative. Even a lightweight variant, AASIST-L, with only 85K parameters, outperforms all competing systems.
Submitted 4 October, 2021; originally announced October 2021.
Comments: 5 pages, 1 figure, 3 tables, submitted to ICASSP 2022
arXiv:2108.07640 (https://arxiv.org/abs/2108.07640) [cs.CV, cs.SD, eess.AS, eess.IV]
Look Who's Talking: Active Speaker Detection in the Wild
Authors: You Jin Kim, Hee-Soo Heo, Soyeon Choe, Soo-Whan Chung, Yoohwan Kwon, Bong-Jin Lee, Youngki Kwon, Joon Son Chung
Abstract: In this work, we present a novel audio-visual dataset for active speaker detection in the wild. A speaker is considered active when his or her face is visible and the voice is audible simultaneously. Although active speaker detection is a crucial pre-processing step for many audio-visual tasks, there is no existing dataset of natural human speech with which to evaluate the performance of active speaker detection. We therefore curate the Active Speakers in the Wild (ASW) dataset, which contains videos and co-occurring speech segments with dense speech activity labels. Videos and timestamps of audible segments are parsed and adopted from VoxConverse, an existing speaker diarisation dataset that consists of videos in the wild. Face tracks are extracted from the videos, and active segments are annotated based on the timestamps of VoxConverse in a semi-automatic way. Two reference systems, a self-supervised system and a fully supervised one, are evaluated on the dataset to provide the baseline performances of ASW. Cross-domain evaluation is conducted in order to show the negative effect of dubbed videos in the training data.
Submitted 17 August, 2021; originally announced August 2021.
Comments: To appear in Interspeech 2021. Data will be available from https://github.com/clovaai/lookwhostalking
arXiv:2104.02879 (https://arxiv.org/abs/2104.02879) [eess.AS, cs.LG, cs.SD]
Adapting Speaker Embeddings for Speaker Diarisation
Authors: Youngki Kwon, Jee-weon Jung, Hee-Soo Heo, You Jin Kim, Bong-Jin Lee, Joon Son Chung
Abstract: The goal of this paper is to adapt speaker embeddings for solving the problem of speaker diarisation. The quality of speaker embeddings is paramount to the performance of speaker diarisation systems. Despite this, prior works in the field have directly used embeddings designed only to be effective on the speaker verification task. In this paper, we propose three techniques that can be used to better adapt the speaker embeddings for diarisation: dimensionality reduction, attention-based embedding aggregation, and non-speech clustering. A wide range of experiments is performed on various challenging datasets. The results demonstrate that all three techniques contribute positively to the performance of the diarisation system, achieving an average relative improvement of 25.07% in terms of diarisation error rate over the baseline.
Submitted 6 April, 2021; originally announced April 2021.
Comments: 5 pages, 2 figures, 3 tables, submitted to Interspeech as a conference paper
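Of the three adaptations, attention-based embedding aggregation is the easiest to sketch: sub-segment embeddings are pooled with learned weights instead of a plain average. A generic sketch; the scoring network is an assumption, not the paper's design:

    import torch
    import torch.nn as nn

    class AttentiveAggregation(nn.Module):
        """Weighted pooling of sub-segment speaker embeddings."""
        def __init__(self, dim):
            super().__init__()
            self.score = nn.Linear(dim, 1)   # learned relevance per embedding

        def forward(self, embs):             # embs: (n_subsegments, dim)
            weights = torch.softmax(self.score(embs), dim=0)
            return (weights * embs).sum(dim=0)

The intuition is that sub-segments contaminated by noise or overlap should receive low weights, so the pooled embedding reflects the dominant speaker more faithfully than uniform averaging.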
arXiv:2104.02878 (https://arxiv.org/abs/2104.02878) [eess.AS, cs.LG, cs.SD]
Three-class Overlapped Speech Detection using a Convolutional Recurrent Neural Network
Authors: Jee-weon Jung, Hee-Soo Heo, Youngki Kwon, Joon Son Chung, Bong-Jin Lee
Abstract: In this work, we propose an overlapped speech detection system trained as a three-class classifier. Unlike conventional systems that perform binary classification as to whether or not a frame contains overlapped speech, the proposed approach classifies frames into three classes: non-speech, single-speaker speech, and overlapped speech. By training a network with this more detailed label definition, the model can learn a better notion of the number of speakers present in a given frame. A convolutional recurrent neural network architecture is explored to benefit from both the convolutional layers' capability to model local patterns and the recurrent layers' ability to model sequential information. The proposed overlapped speech detection model establishes state-of-the-art performance with a precision of 0.6648 and a recall of 0.3222 on the DIHARD II evaluation set, showing a 20% increase in recall along with higher precision. In addition, we introduce a simple approach that utilises the proposed overlapped speech detection model for speaker diarisation, which ranked third place in Track 1 of the DIHARD III challenge.
Submitted 6 April, 2021; originally announced April 2021.
Comments: 5 pages, 2 figures, 4 tables, submitted to Interspeech as a conference paper
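A convolutional recurrent network with a per-frame three-way softmax matches the architecture described above. A minimal sketch in which all layer sizes are illustrative assumptions:

    import torch
    import torch.nn as nn

    class CRNNOverlapDetector(nn.Module):
        """Frame-level classifier: non-speech / single speaker / overlap."""
        def __init__(self, n_mels=64, hidden=128):
            super().__init__()
            self.conv = nn.Sequential(             # local spectro-temporal patterns
                nn.Conv2d(1, 32, 3, padding=1), nn.ReLU(),
                nn.Conv2d(32, 32, 3, padding=1), nn.ReLU(),
            )
            self.rnn = nn.GRU(32 * n_mels, hidden, batch_first=True,
                              bidirectional=True)  # sequential context
            self.head = nn.Linear(2 * hidden, 3)   # three classes per frame

        def forward(self, x):                      # x: (batch, 1, frames, n_mels)
            h = self.conv(x)                       # (batch, 32, frames, n_mels)
            h = h.permute(0, 2, 1, 3).flatten(2)   # (batch, frames, 32 * n_mels)
            h, _ = self.rnn(h)
            return self.head(h)                    # (batch, frames, 3) logits

Training with a per-frame cross-entropy over the three labels gives the model the finer-grained supervision the abstract credits for the recall improvement.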
arXiv:2012.06867 (https://arxiv.org/abs/2012.06867) [cs.SD, cs.LG, eess.AS]
VoxSRC 2020: The Second VoxCeleb Speaker Recognition Challenge
Authors: Arsha Nagrani, Joon Son Chung, Jaesung Huh, Andrew Brown, Ernesto Coto, Weidi Xie, Mitchell McLaren, Douglas A Reynolds, Andrew Zisserman
Abstract: We held the second installment of the VoxCeleb Speaker Recognition Challenge in conjunction with Interspeech 2020. The goal of this challenge was to assess how well current speaker recognition technology is able to diarise and recognize speakers in unconstrained or 'in the wild' data. It consisted of: (i) a publicly available speaker recognition and diarisation dataset from YouTube videos, together with ground truth annotation and standardised evaluation software; and (ii) a virtual public challenge and workshop held at Interspeech 2020. This paper outlines the challenge, and describes the baselines, methods used, and results. We conclude with a discussion of the progress over the first installment of the challenge.
Submitted 12 December, 2020; originally announced December 2020.
has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2108.07640v1-abstract-short" style="display: inline;"> In this work, we present a novel audio-visual dataset for active speaker detection in the wild. A speaker is considered active when his or her face is visible and the voice is audible simultaneously. Although active speaker detection is a crucial pre-processing step for many audio-visual tasks, there is no existing dataset of natural human speech to evaluate the performance of active speaker detec… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2108.07640v1-abstract-full').style.display = 'inline'; document.getElementById('2108.07640v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2108.07640v1-abstract-full" style="display: none;"> In this work, we present a novel audio-visual dataset for active speaker detection in the wild. A speaker is considered active when his or her face is visible and the voice is audible simultaneously. Although active speaker detection is a crucial pre-processing step for many audio-visual tasks, there is no existing dataset of natural human speech to evaluate the performance of active speaker detection. We therefore curate the Active Speakers in the Wild (ASW) dataset which contains videos and co-occurring speech segments with dense speech activity labels. Videos and timestamps of audible segments are parsed and adopted from VoxConverse, an existing speaker diarisation dataset that consists of videos in the wild. Face tracks are extracted from the videos and active segments are annotated based on the timestamps of VoxConverse in a semi-automatic way. Two reference systems, a self-supervised system and a fully supervised one, are evaluated on the dataset to provide the baseline performances of ASW. Cross-domain evaluation is conducted in order to show the negative effect of dubbed videos in the training data. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2108.07640v1-abstract-full').style.display = 'none'; document.getElementById('2108.07640v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 August, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">To appear in Interspeech 2021. 
Data will be available from https://github.com/clovaai/lookwhostalking</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2104.02879">arXiv:2104.02879</a> <span> [<a href="https://arxiv.org/pdf/2104.02879">pdf</a>, <a href="https://arxiv.org/format/2104.02879">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Adapting Speaker Embeddings for Speaker Diarisation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Kwon%2C+Y">Youngki Kwon</a>, <a href="/search/eess?searchtype=author&query=Jung%2C+J">Jee-weon Jung</a>, <a href="/search/eess?searchtype=author&query=Heo%2C+H">Hee-Soo Heo</a>, <a href="/search/eess?searchtype=author&query=Kim%2C+Y+J">You Jin Kim</a>, <a href="/search/eess?searchtype=author&query=Lee%2C+B">Bong-Jin Lee</a>, <a href="/search/eess?searchtype=author&query=Chung%2C+J+S">Joon Son Chung</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2104.02879v1-abstract-short" style="display: inline;"> The goal of this paper is to adapt speaker embeddings for solving the problem of speaker diarisation. The quality of speaker embeddings is paramount to the performance of speaker diarisation systems. Despite this, prior works in the field have directly used embeddings designed only to be effective on the speaker verification task. In this paper, we propose three techniques that can be used to bett… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2104.02879v1-abstract-full').style.display = 'inline'; document.getElementById('2104.02879v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2104.02879v1-abstract-full" style="display: none;"> The goal of this paper is to adapt speaker embeddings for solving the problem of speaker diarisation. The quality of speaker embeddings is paramount to the performance of speaker diarisation systems. Despite this, prior works in the field have directly used embeddings designed only to be effective on the speaker verification task. In this paper, we propose three techniques that can be used to better adapt the speaker embeddings for diarisation: dimensionality reduction, attention-based embedding aggregation, and non-speech clustering. A wide range of experiments is performed on various challenging datasets. The results demonstrate that all three techniques contribute positively to the performance of the diarisation system achieving an average relative improvement of 25.07% in terms of diarisation error rate over the baseline. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2104.02879v1-abstract-full').style.display = 'none'; document.getElementById('2104.02879v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 April, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages, 2 figures, 3 tables, submitted to Interspeech as a conference paper</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2104.02878">arXiv:2104.02878</a> <span> [<a href="https://arxiv.org/pdf/2104.02878">pdf</a>, <a href="https://arxiv.org/format/2104.02878">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Three-class Overlapped Speech Detection using a Convolutional Recurrent Neural Network </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Jung%2C+J">Jee-weon Jung</a>, <a href="/search/eess?searchtype=author&query=Heo%2C+H">Hee-Soo Heo</a>, <a href="/search/eess?searchtype=author&query=Kwon%2C+Y">Youngki Kwon</a>, <a href="/search/eess?searchtype=author&query=Chung%2C+J+S">Joon Son Chung</a>, <a href="/search/eess?searchtype=author&query=Lee%2C+B">Bong-Jin Lee</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2104.02878v1-abstract-short" style="display: inline;"> In this work, we propose an overlapped speech detection system trained as a three-class classifier. Unlike conventional systems that perform binary classification as to whether or not a frame contains overlapped speech, the proposed approach classifies into three classes: non-speech, single speaker speech, and overlapped speech. By training a network with the more detailed label definition, the mo… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2104.02878v1-abstract-full').style.display = 'inline'; document.getElementById('2104.02878v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2104.02878v1-abstract-full" style="display: none;"> In this work, we propose an overlapped speech detection system trained as a three-class classifier. Unlike conventional systems that perform binary classification as to whether or not a frame contains overlapped speech, the proposed approach classifies into three classes: non-speech, single speaker speech, and overlapped speech. By training a network with the more detailed label definition, the model can learn a better notion on deciding the number of speakers included in a given frame. 
arXiv:2012.06867 [cs.SD, cs.LG, eess.AS] — https://arxiv.org/abs/2012.06867
VoxSRC 2020: The Second VoxCeleb Speaker Recognition Challenge
Authors: Arsha Nagrani, Joon Son Chung, Jaesung Huh, Andrew Brown, Ernesto Coto, Weidi Xie, Mitchell McLaren, Douglas A Reynolds, Andrew Zisserman
Abstract: We held the second installment of the VoxCeleb Speaker Recognition Challenge in conjunction with Interspeech 2020. The goal of this challenge was to assess how well current speaker recognition technology is able to diarise and recognize speakers in unconstrained or 'in the wild' data. It consisted of: (i) a publicly available speaker recognition and diarisation dataset from YouTube videos, together with ground truth annotation and standardised evaluation software; and (ii) a virtual public challenge and workshop held at Interspeech 2020. This paper outlines the challenge and describes the baselines, methods used, and results. We conclude with a discussion of the progress over the first installment of the challenge.
Submitted 12 December, 2020; originally announced December 2020.
arXiv:2011.14885 [cs.SD, eess.AS] — https://arxiv.org/abs/2011.14885
Look who's not talking
Authors: Youngki Kwon, Hee Soo Heo, Jaesung Huh, Bong-Jin Lee, Joon Son Chung
Abstract: The objective of this work is speaker diarisation of speech recordings 'in the wild'. The ability to determine speech segments is a crucial part of diarisation systems, accounting for a large proportion of errors. In this paper, we present a simple but effective solution for speech activity detection based on speaker embeddings. In particular, we discover that the norm of the speaker embedding is an extremely effective indicator of speech activity. The method does not require an independent model for speech activity detection, and therefore allows speaker diarisation to be performed using a unified representation for both speaker modelling and speech activity detection. We perform a number of experiments on in-house and public datasets, in which our method outperforms popular baselines.
Submitted 30 November, 2020; originally announced November 2020.
Comments: SLT 2021
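The core observation, that the embedding norm indicates speech activity, is simple enough to sketch directly. Everything below other than the norm thresholding itself (the array shapes, the threshold value) is an assumption.

```python
import numpy as np

def speech_activity_from_norm(embeddings: np.ndarray, threshold: float) -> np.ndarray:
    """Flag a segment as speech when the norm of its speaker embedding is large.
    A sketch of the idea in the abstract; in practice the threshold would be
    tuned on development data."""
    norms = np.linalg.norm(embeddings, axis=1)   # one L2 norm per segment
    return norms > threshold                     # boolean speech/non-speech mask

segs = np.random.randn(5, 256)                   # five hypothetical segment embeddings
print(speech_activity_from_norm(segs, threshold=15.0))
```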
arXiv:2011.05189 [cs.SD, eess.AS] — https://arxiv.org/abs/2011.05189
Supervised attention for speaker recognition
Authors: Seong Min Kye, Joon Son Chung, Hoirin Kim
Abstract: The recently proposed self-attentive pooling (SAP) has shown good performance in several speaker recognition systems. In SAP systems, the context vector is trained end-to-end together with the feature extractor, where the role of the context vector is to select the most discriminative frames for speaker recognition. However, SAP underperforms the temporal average pooling (TAP) baseline in some settings, which implies that the attention is not learnt effectively in end-to-end training. To tackle this problem, we introduce strategies for training the attention mechanism in a supervised manner, which learn the context vector using classified samples. With our proposed methods, the context vector can be boosted to select the most informative frames. We show that our method outperforms existing methods in various experimental settings, including short-utterance speaker recognition, and achieves competitive performance over the existing baselines on the VoxCeleb datasets.
Submitted 3 December, 2020; v1 submitted 10 November, 2020; originally announced November 2020.
Comments: SLT 2021
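For reference, standard self-attentive pooling, the mechanism this paper builds on, can be sketched in a few lines of PyTorch. The paper's supervised training of the attention is only indicated by a comment, since the abstract does not specify it; all names and sizes here are invented.

```python
import torch
import torch.nn as nn

class SelfAttentivePooling(nn.Module):
    """Plain SAP: a learned context vector scores each frame, and the scores
    weight the frame-level features. Only the standard computation is shown;
    the paper adds supervision on top of this."""
    def __init__(self, dim: int = 512):
        super().__init__()
        self.context = nn.Parameter(torch.randn(dim))  # the context vector

    def forward(self, frames: torch.Tensor):
        # frames: (batch, time, dim) frame-level features from the extractor
        scores = frames @ self.context                  # (batch, time)
        attn = torch.softmax(scores, dim=1)             # frame-selection weights
        pooled = torch.einsum('bt,btd->bd', attn, frames)
        return pooled, attn  # attn is where a supervised auxiliary loss could attach

x = torch.randn(4, 200, 512)
pooled, attn = SelfAttentivePooling()(x)
print(pooled.shape, attn.shape)  # torch.Size([4, 512]) torch.Size([4, 200])
```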
arXiv:2010.15809 [cs.SD, eess.AS] — https://arxiv.org/abs/2010.15809
The ins and outs of speaker recognition: lessons from VoxSRC 2020
Authors: Yoohwan Kwon, Hee-Soo Heo, Bong-Jin Lee, Joon Son Chung
Abstract: The VoxCeleb Speaker Recognition Challenge (VoxSRC) at Interspeech 2020 offers a challenging evaluation for speaker recognition systems, which includes celebrities playing different parts in movies. The goal of this work is robust speaker recognition of utterances recorded in these challenging environments. We utilise variants of the popular ResNet architecture for speaker recognition and perform extensive experiments using a range of loss functions and training parameters. To this end, we optimise an efficient training framework that allows powerful models to be trained with limited time and resources. Our trained models demonstrate improvements over most existing works with lighter models and a simple pipeline. The paper shares the lessons learned from our participation in the challenge.
Submitted 29 October, 2020; originally announced October 2020.
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2010.15716">arXiv:2010.15716</a> <span> [<a href="https://arxiv.org/pdf/2010.15716">pdf</a>, <a href="https://arxiv.org/format/2010.15716">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Playing a Part: Speaker Verification at the Movies </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Brown%2C+A">Andrew Brown</a>, <a href="/search/eess?searchtype=author&query=Huh%2C+J">Jaesung Huh</a>, <a href="/search/eess?searchtype=author&query=Nagrani%2C+A">Arsha Nagrani</a>, <a href="/search/eess?searchtype=author&query=Chung%2C+J+S">Joon Son Chung</a>, <a href="/search/eess?searchtype=author&query=Zisserman%2C+A">Andrew Zisserman</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2010.15716v2-abstract-short" style="display: inline;"> The goal of this work is to investigate the performance of popular speaker recognition models on speech segments from movies, where often actors intentionally disguise their voice to play a character. We make the following three contributions: (i) We collect a novel, challenging speaker recognition dataset called VoxMovies, with speech for 856 identities from almost 4000 movie clips. VoxMovies con… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2010.15716v2-abstract-full').style.display = 'inline'; document.getElementById('2010.15716v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2010.15716v2-abstract-full" style="display: none;"> The goal of this work is to investigate the performance of popular speaker recognition models on speech segments from movies, where often actors intentionally disguise their voice to play a character. We make the following three contributions: (i) We collect a novel, challenging speaker recognition dataset called VoxMovies, with speech for 856 identities from almost 4000 movie clips. VoxMovies contains utterances with varying emotion, accents and background noise, and therefore comprises an entirely different domain to the interview-style, emotionally calm utterances in current speaker recognition datasets such as VoxCeleb; (ii) We provide a number of domain adaptation evaluation sets, and benchmark the performance of state-of-the-art speaker recognition models on these evaluation pairs. We demonstrate that both speaker verification and identification performance drops steeply on this new data, showing the challenge in transferring models across domains; and finally (iii) We show that simple domain adaptation paradigms improve performance, but there is still large room for improvement. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2010.15716v2-abstract-full').style.display = 'none'; document.getElementById('2010.15716v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 February, 2021; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 29 October, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2020. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">The first three authors contributed equally to this work</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2010.11543">arXiv:2010.11543</a> <span> [<a href="https://arxiv.org/pdf/2010.11543">pdf</a>, <a href="https://arxiv.org/format/2010.11543">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Graph Attention Networks for Speaker Verification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Jung%2C+J">Jee-weon Jung</a>, <a href="/search/eess?searchtype=author&query=Heo%2C+H">Hee-Soo Heo</a>, <a href="/search/eess?searchtype=author&query=Yu%2C+H">Ha-Jin Yu</a>, <a href="/search/eess?searchtype=author&query=Chung%2C+J+S">Joon Son Chung</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2010.11543v2-abstract-short" style="display: inline;"> This work presents a novel back-end framework for speaker verification using graph attention networks. Segment-wise speaker embeddings extracted from multiple crops within an utterance are interpreted as node representations of a graph. The proposed framework inputs segment-wise speaker embeddings from an enrollment and a test utterance and directly outputs a similarity score. We first construct a… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2010.11543v2-abstract-full').style.display = 'inline'; document.getElementById('2010.11543v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2010.11543v2-abstract-full" style="display: none;"> This work presents a novel back-end framework for speaker verification using graph attention networks. Segment-wise speaker embeddings extracted from multiple crops within an utterance are interpreted as node representations of a graph. The proposed framework inputs segment-wise speaker embeddings from an enrollment and a test utterance and directly outputs a similarity score. We first construct a graph using segment-wise speaker embeddings and then input these to graph attention networks. 
arXiv:2009.14153 [eess.AS, cs.SD] — https://arxiv.org/abs/2009.14153
Clova Baseline System for the VoxCeleb Speaker Recognition Challenge 2020
Authors: Hee Soo Heo, Bong-Jin Lee, Jaesung Huh, Joon Son Chung
Abstract: This report describes our submission to the VoxCeleb Speaker Recognition Challenge (VoxSRC) at Interspeech 2020. We perform a careful analysis of speaker recognition models based on the popular ResNet architecture, and train a number of variants using a range of loss functions. Our results show significant improvements over most existing works without the use of model ensembles or post-processing. We release the training code and pre-trained models as unofficial baselines for this year's challenge.
Submitted 29 September, 2020; originally announced September 2020.
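The report does not name its loss functions here, so the example below is a swapped-in illustration rather than the authors' method: additive angular margin (AAM) softmax, a loss widely used for ResNet speaker recognition systems of this kind. The margin and scale values are generic defaults, not the report's settings.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class AAMSoftmax(nn.Module):
    """Additive angular margin softmax: adds a margin to the angle between an
    embedding and its target class weight before scaling and cross-entropy.
    A generic sketch, not the report's exact configuration."""
    def __init__(self, dim: int, n_classes: int, margin: float = 0.2, scale: float = 30.0):
        super().__init__()
        self.weight = nn.Parameter(torch.randn(n_classes, dim))
        self.margin, self.scale = margin, scale

    def forward(self, emb: torch.Tensor, labels: torch.Tensor) -> torch.Tensor:
        # cosine similarity between L2-normalised embeddings and class weights
        cos = F.linear(F.normalize(emb), F.normalize(self.weight))
        theta = torch.acos(cos.clamp(-1 + 1e-7, 1 - 1e-7))
        target = torch.cos(theta + self.margin)      # penalise the target angle
        onehot = F.one_hot(labels, cos.size(1)).float()
        logits = self.scale * (onehot * target + (1 - onehot) * cos)
        return F.cross_entropy(logits, labels)

loss = AAMSoftmax(dim=512, n_classes=6000)(torch.randn(8, 512), torch.randint(0, 6000, (8,)))
print(loss.item())
```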
arXiv:2008.05983 [eess.AS, cs.SD] — https://arxiv.org/abs/2008.05983
Cross attentive pooling for speaker verification
Authors: Seong Min Kye, Yoohwan Kwon, Joon Son Chung
Abstract: The goal of this paper is text-independent speaker verification where utterances come from 'in the wild' videos and may contain irrelevant signal. While speaker verification is naturally a pair-wise problem, existing methods to produce speaker embeddings are instance-wise. In this paper, we propose Cross Attentive Pooling (CAP), which utilizes the context information across the reference-query pair to generate utterance-level embeddings that contain the most discriminative information for the pair-wise matching problem. Experiments are performed on the VoxCeleb dataset, on which our method outperforms comparable pooling strategies.
Submitted 3 December, 2020; v1 submitted 13 August, 2020; originally announced August 2020.
Comments: SLT 2021. Code available at https://github.com/seongmin-kye/CAP
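A rough sketch of the pair-wise pooling idea: each utterance's frames are weighted by their similarity to the other utterance's frames, so the pooled embedding depends on the pair rather than on one utterance alone. This is an invented simplification, not the paper's exact formulation; the authors' actual code is at the GitHub link above.

```python
import torch

def cross_attentive_pool(ref: torch.Tensor, qry: torch.Tensor):
    """Pool each utterance's frame features with attention derived from the
    other utterance in the reference-query pair (a simplified sketch)."""
    # ref: (Tr, d), qry: (Tq, d) frame-level features
    sim = ref @ qry.T                               # (Tr, Tq) cross-similarity
    w_ref = torch.softmax(sim.mean(dim=1), dim=0)   # ref frames scored via qry
    w_qry = torch.softmax(sim.mean(dim=0), dim=0)   # qry frames scored via ref
    return w_ref @ ref, w_qry @ qry                 # pair-aware utterance embeddings

e_ref, e_qry = cross_attentive_pool(torch.randn(50, 256), torch.randn(60, 256))
print(torch.nn.functional.cosine_similarity(e_ref, e_qry, dim=0).item())
```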