
Search | arXiv e-print repository

<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1&ndash;42 of 42 results for author: <span class="mathjax">Heo, H</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> <div class="content"> <form method="GET" action="/search/eess" aria-role="search"> Searching in archive <strong>eess</strong>. <a href="/search/?searchtype=author&amp;query=Heo%2C+H">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Heo, H"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Heo%2C+H&amp;terms-0-field=author&amp;size=50&amp;order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Heo, H"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.14559">arXiv:2406.14559</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.14559">pdf</a>, <a href="https://arxiv.org/format/2406.14559">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Disentangled Representation Learning for Environment-agnostic Speaker Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Nam%2C+K">KiHyun Nam</a>, <a href="/search/eess?searchtype=author&amp;query=Heo%2C+H">Hee-Soo Heo</a>, <a href="/search/eess?searchtype=author&amp;query=Jung%2C+J">Jee-weon Jung</a>, <a href="/search/eess?searchtype=author&amp;query=Chung%2C+J+S">Joon Son Chung</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.14559v1-abstract-short" style="display: inline;"> This work presents a framework based on feature disentanglement to learn speaker embeddings that are robust to environmental variations. Our framework utilises an auto-encoder as a disentangler, dividing the input speaker embedding into components related to the speaker and other residual information. We employ a group of objective functions to ensure that the auto-encoder&#39;s code representation -&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.14559v1-abstract-full').style.display = 'inline'; document.getElementById('2406.14559v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.14559v1-abstract-full" style="display: none;"> This work presents a framework based on feature disentanglement to learn speaker embeddings that are robust to environmental variations. Our framework utilises an auto-encoder as a disentangler, dividing the input speaker embedding into components related to the speaker and other residual information. We employ a group of objective functions to ensure that the auto-encoder&#39;s code representation - used as the refined embedding - condenses only the speaker characteristics. We show the versatility of our framework through its compatibility with any existing speaker embedding extractor, requiring no structural modifications or adaptations for integration. We validate the effectiveness of our framework by incorporating it into two popularly used embedding extractors and conducting experiments across various benchmarks. 
The results show a performance improvement of up to 16%. We release our code for this work to be available https://github.com/kaistmm/voxceleb-disentangler <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.14559v1-abstract-full').style.display = 'none'; document.getElementById('2406.14559v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Interspeech 2024. The official webpage can be found at https://mm.kaist.ac.kr/projects/voxceleb-disentangler/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2312.08603">arXiv:2312.08603</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2312.08603">pdf</a>, <a href="https://arxiv.org/format/2312.08603">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> NeXt-TDNN: Modernizing Multi-Scale Temporal Convolution Backbone for Speaker Verification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Heo%2C+H">Hyun-Jun Heo</a>, <a href="/search/eess?searchtype=author&amp;query=Shin%2C+U">Ui-Hyeop Shin</a>, <a href="/search/eess?searchtype=author&amp;query=Lee%2C+R">Ran Lee</a>, <a href="/search/eess?searchtype=author&amp;query=Cheon%2C+Y">YoungJu Cheon</a>, <a href="/search/eess?searchtype=author&amp;query=Park%2C+H">Hyung-Min Park</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2312.08603v2-abstract-short" style="display: inline;"> In speaker verification, ECAPA-TDNN has shown remarkable improvement by utilizing one-dimensional(1D) Res2Net block and squeeze-and-excitation(SE) module, along with multi-layer feature aggregation (MFA). Meanwhile, in vision tasks, ConvNet structures have been modernized by referring to Transformer, resulting in improved performance. In this paper, we present an improved block design for TDNN in&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.08603v2-abstract-full').style.display = 'inline'; document.getElementById('2312.08603v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2312.08603v2-abstract-full" style="display: none;"> In speaker verification, ECAPA-TDNN has shown remarkable improvement by utilizing one-dimensional(1D) Res2Net block and squeeze-and-excitation(SE) module, along with multi-layer feature aggregation (MFA). Meanwhile, in vision tasks, ConvNet structures have been modernized by referring to Transformer, resulting in improved performance. In this paper, we present an improved block design for TDNN in speaker verification. 
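The entry above describes an auto-encoder disentangler that splits a pre-computed speaker embedding into a speaker-related code and a residual part. The sketch below is a minimal, hypothetical illustration of that idea, not the authors' implementation: a reconstruction loss keeps the two parts faithful to the input, while a speaker-classification loss on the code pushes speaker identity into it. The module sizes, loss weighting, and classifier head are assumptions, and the paper's full set of objectives is not reproduced.

```python
# Minimal sketch (not the authors' code) of an auto-encoder disentangler that splits
# a speaker embedding into a speaker code and a residual component.
import torch
import torch.nn as nn
import torch.nn.functional as F

class Disentangler(nn.Module):
    def __init__(self, emb_dim=192, code_dim=128, res_dim=64, n_speakers=1000):
        super().__init__()
        self.spk_enc = nn.Sequential(nn.Linear(emb_dim, 256), nn.ReLU(), nn.Linear(256, code_dim))
        self.res_enc = nn.Sequential(nn.Linear(emb_dim, 256), nn.ReLU(), nn.Linear(256, res_dim))
        self.decoder = nn.Sequential(nn.Linear(code_dim + res_dim, 256), nn.ReLU(), nn.Linear(256, emb_dim))
        self.spk_head = nn.Linear(code_dim, n_speakers)  # auxiliary speaker classifier (assumed)

    def forward(self, emb):
        code, res = self.spk_enc(emb), self.res_enc(emb)
        recon = self.decoder(torch.cat([code, res], dim=-1))
        return code, res, recon

def training_losses(model, emb, spk_labels, recon_weight=1.0):
    """Reconstruction preserves information; classification pulls speaker identity into the code."""
    code, res, recon = model(emb)
    loss_recon = F.mse_loss(recon, emb)
    loss_spk = F.cross_entropy(model.spk_head(code), spk_labels)
    return loss_spk + recon_weight * loss_recon

# Example: refine a batch of 192-dim embeddings taken from a frozen extractor.
model = Disentangler()
emb = torch.randn(8, 192)
labels = torch.randint(0, 1000, (8,))
loss = training_losses(model, emb, labels)
loss.backward()
```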
2. arXiv:2312.08603 [pdf, other] (eess.AS; cs.SD)
   Title: NeXt-TDNN: Modernizing Multi-Scale Temporal Convolution Backbone for Speaker Verification
   Authors: Hyun-Jun Heo, Ui-Hyeop Shin, Ran Lee, YoungJu Cheon, Hyung-Min Park
   Abstract: In speaker verification, ECAPA-TDNN has shown remarkable improvement by utilizing a one-dimensional (1D) Res2Net block and a squeeze-and-excitation (SE) module, along with multi-layer feature aggregation (MFA). Meanwhile, in vision tasks, ConvNet structures have been modernized by referring to Transformers, resulting in improved performance. In this paper, we present an improved block design for TDNN in speaker verification. Inspired by recent ConvNet structures, we replace the SE-Res2Net block in ECAPA-TDNN with a novel 1D two-step multi-scale ConvNeXt block, which we call TS-ConvNeXt. The TS-ConvNeXt block is constructed from two separate sub-modules: a temporal multi-scale convolution (MSC) and a frame-wise feed-forward network (FFN). This two-step design allows for flexible capturing of inter-frame and intra-frame contexts. Additionally, we introduce global response normalization (GRN) for the FFN modules to enable more selective feature propagation, similar to the SE module in ECAPA-TDNN. Experimental results demonstrate that NeXt-TDNN, with a modernized backbone block, significantly improves performance in speaker verification tasks while reducing parameter size and inference time. We have released our code for future studies.
   Submitted 14 December, 2023; v1 submitted 13 December, 2023; originally announced December 2023.
   Comments: Accepted by ICASSP 2024
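The NeXt-TDNN abstract above specifies the structure of the TS-ConvNeXt block: a temporal multi-scale convolution (MSC) followed by a frame-wise feed-forward network (FFN) with global response normalization (GRN), each applied residually. The PyTorch sketch below follows that two-step description only; the kernel sizes, channel widths, normalisation placement, and the adaptation of GRN to 1D inputs are assumptions rather than the released implementation.

```python
# Rough sketch of a TS-ConvNeXt-style block (not the released NeXt-TDNN code):
# step 1: temporal multi-scale depthwise convolution (MSC); step 2: frame-wise FFN with GRN.
import torch
import torch.nn as nn

class GRN1d(nn.Module):
    """Global response normalization adapted to (batch, time, channels) tensors (assumed variant)."""
    def __init__(self, dim):
        super().__init__()
        self.gamma = nn.Parameter(torch.zeros(1, 1, dim))
        self.beta = nn.Parameter(torch.zeros(1, 1, dim))

    def forward(self, x):                               # x: (B, T, C)
        gx = torch.norm(x, p=2, dim=1, keepdim=True)            # per-channel global response
        nx = gx / (gx.mean(dim=-1, keepdim=True) + 1e-6)        # divisive normalisation across channels
        return self.gamma * (x * nx) + self.beta + x

class TSConvNeXtBlock(nn.Module):
    def __init__(self, dim=256, kernel_sizes=(7, 15, 31), ffn_mult=4):
        super().__init__()
        # Multi-scale depthwise temporal convolutions, one branch per kernel size (sizes assumed).
        self.msc = nn.ModuleList(
            [nn.Conv1d(dim, dim, k, padding=k // 2, groups=dim) for k in kernel_sizes]
        )
        self.norm1 = nn.LayerNorm(dim)
        self.norm2 = nn.LayerNorm(dim)
        self.ffn = nn.Sequential(
            nn.Linear(dim, ffn_mult * dim), nn.GELU(), GRN1d(ffn_mult * dim), nn.Linear(ffn_mult * dim, dim)
        )

    def forward(self, x):                               # x: (B, T, C)
        # Step 1: inter-frame context via multi-scale convolution, with a residual connection.
        h = self.norm1(x).transpose(1, 2)               # (B, C, T) for Conv1d
        h = sum(conv(h) for conv in self.msc).transpose(1, 2)
        x = x + h
        # Step 2: intra-frame (channel) mixing via the frame-wise FFN with GRN, residual again.
        return x + self.ffn(self.norm2(x))

frames = torch.randn(4, 200, 256)                       # 200 frames of 256-dim features
out = TSConvNeXtBlock()(frames)                         # same shape: (4, 200, 256)
```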
3. arXiv:2309.14741 [pdf, other] (eess.AS; cs.SD)
   Title: Rethinking Session Variability: Leveraging Session Embeddings for Session Robustness in Speaker Verification
   Authors: Hee-Soo Heo, KiHyun Nam, Bong-Jin Lee, Youngki Kwon, Minjae Lee, You Jin Kim, Joon Son Chung
   Abstract: In the field of speaker verification, session or channel variability poses a significant challenge. While many contemporary methods aim to disentangle session information from speaker embeddings, we introduce a novel approach using an additional embedding to represent the session information. This is achieved by training an auxiliary network appended to the speaker embedding extractor, which remains fixed during this training process. This results in two similarity scores: one for the speaker information and one for the session information. The latter score acts as a compensator for the former, which might be skewed due to session variations. Our extensive experiments demonstrate that session information can be effectively compensated for without retraining the embedding extractor.
   Submitted 26 September, 2023; originally announced September 2023.
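The abstract above reports two similarity scores per trial, one from the speaker embeddings and one from the auxiliary session embeddings, with the session score compensating the speaker score. The exact compensation rule is not given in the abstract; the sketch below shows one plausible form (a weighted subtraction of the session similarity) purely for illustration, with `alpha` a hypothetical tuning parameter.

```python
# Illustrative only: one way a session-similarity score could compensate a speaker-similarity
# score at trial time. The actual combination rule used in the paper may differ.
import numpy as np

def cosine(a, b):
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-9))

def compensated_score(spk_enr, spk_tst, ses_enr, ses_tst, alpha=0.3):
    """Subtract a fraction of the session similarity: pairs that look alike mainly because
    they share recording conditions are pushed down; speaker-driven similarity is kept."""
    return cosine(spk_enr, spk_tst) - alpha * cosine(ses_enr, ses_tst)

# Example with random vectors standing in for enrolment/test embeddings.
rng = np.random.default_rng(0)
spk_e, spk_t = rng.normal(size=192), rng.normal(size=192)
ses_e, ses_t = rng.normal(size=64), rng.normal(size=64)
print(compensated_score(spk_e, spk_t, ses_e, ses_t))
```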
4. arXiv:2306.00680 [pdf, other] (cs.SD; cs.AI; eess.AS)
   Title: Encoder-decoder multimodal speaker change detection
   Authors: Jee-weon Jung, Soonshin Seo, Hee-Soo Heo, Geonmin Kim, You Jin Kim, Young-ki Kwon, Minjae Lee, Bong-Jin Lee
   Abstract: The task of speaker change detection (SCD), which detects points where speakers change in an input, is essential for several applications. Several studies solved the SCD task using audio inputs only and have shown limited performance. Recently, multimodal SCD (MMSCD) models, which utilise the text modality in addition to audio, have shown improved performance. In this study, the proposed model is built upon two main proposals: a novel mechanism for modality fusion and the adoption of an encoder-decoder architecture. Unlike previous MMSCD works that extract speaker embeddings from extremely short audio segments aligned to a single word, we use a speaker embedding extracted from 1.5 s of audio. A transformer decoder layer further improves the performance of an encoder-only MMSCD model. The proposed model achieves state-of-the-art results among studies that report SCD performance and is also on par with recent work that combines SCD with automatic speech recognition via human transcription.
   Submitted 1 June, 2023; originally announced June 2023.
   Comments: 5 pages, accepted for presentation at INTERSPEECH 2023

5. arXiv:2304.03940 [pdf, other] (cs.LG; cs.AI; cs.SD; eess.AS)
   Title: Unsupervised Speech Representation Pooling Using Vector Quantization
   Authors: Jeongkyun Park, Kwanghee Choi, Hyunjun Heo, Hyung-Min Park
   Abstract: With the advent of general-purpose speech representations from large-scale self-supervised models, applying a single model to multiple downstream tasks is becoming a de facto approach. However, the pooling problem remains; the length of speech representations is inherently variable. Naive average pooling is often used, even though it ignores the characteristics of speech, such as phonemes of differing lengths. Hence, we design a novel pooling method to squash acoustically similar representations via vector quantization, which does not require additional training, unlike attention-based pooling. Further, we evaluate various unsupervised pooling methods on various self-supervised models. We gather diverse methods scattered across speech and text and evaluate them on various tasks: keyword spotting, speaker identification, intent classification, and emotion recognition. Finally, we quantitatively and qualitatively analyze our method, comparing it with supervised pooling methods.
   Submitted 8 April, 2023; originally announced April 2023.
6. arXiv:2211.04768 [pdf, other] (eess.AS; cs.SD)
   Title: Absolute decision corrupts absolutely: conservative online speaker diarisation
   Authors: Youngki Kwon, Hee-Soo Heo, Bong-Jin Lee, You Jin Kim, Jee-weon Jung
   Abstract: Our focus lies in developing an online speaker diarisation framework which demonstrates robust performance across diverse domains. In online speaker diarisation, outputs generated in real time are irreversible, and a few misjudgements in the early phase of an input session can lead to catastrophic results. We hypothesise that cautiously increasing the number of estimated speakers is of paramount importance among many other factors. Thus, our proposed framework includes decreasing the number of speakers by one when the system judges that an increase in the past was faulty. We also adopt dual buffers, checkpoints and centroids, where checkpoints are combined with silhouette coefficients to estimate the number of speakers and centroids represent speakers. We further assume that more than one centroid can be generated from one speaker, and thus design a clustering-based label matching technique to assign labels in real time. The resulting system is lightweight yet surprisingly effective, demonstrating state-of-the-art performance on the DIHARD 2 and 3 datasets and competitive performance on the AMI and VoxConverse test sets.
   Submitted 9 November, 2022; originally announced November 2022.
   Comments: 5 pages, 2 figures, 4 tables, submitted to ICASSP
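The diarisation framework above estimates the number of speakers by combining checkpoints with silhouette coefficients. As context, the sketch below shows a generic silhouette-based choice of the speaker count over a buffer of embeddings; the conservative increase/decrease logic and any thresholds described in the abstract are not reproduced, and the clustering method is an assumption.

```python
# Generic silhouette-based estimate of the number of speakers in a buffer of embeddings.
# The paper's conservative update rule (cautiously increasing the count, decreasing by one
# when a past increase is judged faulty) is not modelled here.
import numpy as np
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score

def estimate_num_speakers(embeddings: np.ndarray, max_speakers: int = 8) -> int:
    """embeddings: (N, D) speaker embeddings from recent segments."""
    best_k, best_score = 1, -1.0
    for k in range(2, min(max_speakers, len(embeddings) - 1) + 1):
        labels = AgglomerativeClustering(n_clusters=k).fit_predict(embeddings)
        score = silhouette_score(embeddings, labels)
        if score > best_score:
            best_k, best_score = k, score
    return best_k

# Toy example: two well-separated clusters of embeddings should yield an estimate of 2.
rng = np.random.default_rng(0)
emb = np.vstack([rng.normal(0, 0.1, (20, 16)), rng.normal(3, 0.1, (20, 16))])
print(estimate_num_speakers(emb))
```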
7. arXiv:2211.04060 [pdf, other] (cs.SD; cs.CL; eess.AS)
   Title: High-resolution embedding extractor for speaker diarisation
   Authors: Hee-Soo Heo, Youngki Kwon, Bong-Jin Lee, You Jin Kim, Jee-weon Jung
   Abstract: Speaker embedding extractors significantly influence the performance of clustering-based speaker diarisation systems. Conventionally, only one embedding is extracted from each speech segment. However, because of the sliding window approach, a segment easily includes two or more speakers owing to speaker change points. This study proposes a novel embedding extractor architecture, referred to as a high-resolution embedding extractor (HEE), which extracts multiple high-resolution embeddings from each speech segment. HEE consists of a feature-map extractor and an enhancer, where the enhancer with the self-attention mechanism is the key to success. The enhancer of HEE replaces the aggregation process; instead of a global pooling layer, the enhancer combines information relevant to each frame via attention, leveraging the global context. The extracted dense frame-level embeddings can each represent a speaker, so multiple speakers can be represented by different frame-level features in each segment. We also propose a training framework that artificially generates mixture data to train the proposed HEE. Through experiments on five evaluation sets, including four public datasets, the proposed HEE demonstrates at least a 10% improvement on each evaluation set except one, which we attribute to the scarcity of rapid speaker changes in that set.
   Submitted 8 November, 2022; originally announced November 2022.
   Comments: 5 pages, 2 figures, 3 tables, submitted to ICASSP

8. arXiv:2211.00437 [pdf, other] (eess.AS; cs.SD)
   Title: Disentangled representation learning for multilingual speaker recognition
   Authors: Kihyun Nam, Youkyum Kim, Jaesung Huh, Hee Soo Heo, Jee-weon Jung, Joon Son Chung
   Abstract: The goal of this paper is to learn robust speaker representations for the bilingual speaking scenario. The majority of the world's population speak at least two languages; however, most speaker recognition systems fail to recognise the same speaker when speaking in different languages. Popular speaker recognition evaluation sets do not consider the bilingual scenario, making it difficult to analyse the effect of bilingual speakers on speaker recognition performance. In this paper, we publish a large-scale evaluation set named VoxCeleb1-B, derived from VoxCeleb, that considers bilingual scenarios. We introduce an effective disentanglement learning strategy that combines adversarial and metric learning-based methods. This approach addresses the bilingual situation by disentangling language-related information from the speaker representation while ensuring stable speaker representation learning. Our language-disentangled learning method only uses language pseudo-labels without manual information.
   Submitted 6 June, 2023; v1 submitted 1 November, 2022; originally announced November 2022.
   Comments: Interspeech 2023
9. arXiv:2210.14682 [pdf, other] (cs.SD; cs.AI; eess.AS)
   Title: In search of strong embedding extractors for speaker diarisation
   Authors: Jee-weon Jung, Hee-Soo Heo, Bong-Jin Lee, Jaesung Huh, Andrew Brown, Youngki Kwon, Shinji Watanabe, Joon Son Chung
   Abstract: Speaker embedding extractors (EEs), which map input audio to a speaker discriminant latent space, are of paramount importance in speaker diarisation. However, there are several challenges when adopting EEs for diarisation, of which we tackle two key problems. First, the evaluation is not straightforward because the features required for better performance differ between speaker verification and diarisation. We show that better performance on widely adopted speaker verification evaluation protocols does not lead to better diarisation performance. Second, embedding extractors have not seen utterances in which multiple speakers exist. These inputs are inevitably present in speaker diarisation because of overlapped speech and speaker changes, and they degrade the performance. To mitigate the first problem, we generate speaker verification evaluation protocols that better mimic the diarisation scenario. We propose two data augmentation techniques to alleviate the second problem, making embedding extractors aware of overlapped speech or speaker change input. One technique generates overlapped speech segments, and the other generates segments where two speakers utter sequentially. Extensive experimental results using three state-of-the-art speaker embedding extractors demonstrate that both proposed approaches are effective.
   Submitted 26 October, 2022; originally announced October 2022.
   Comments: 5 pages, 1 figure, 2 tables, submitted to ICASSP
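The entry above proposes two augmentations: one producing overlapped speech and one producing segments in which two speakers talk one after the other. The sketch below illustrates both at the waveform level; the mixing gains, segment lengths, and any crossfade handling are assumptions, not the paper's recipe.

```python
# Waveform-level sketch of the two augmentations described above (details assumed).
import numpy as np

def overlap_augment(wav_a: np.ndarray, wav_b: np.ndarray, snr_db: float = 0.0) -> np.ndarray:
    """Overlapped speech: mix speaker B onto speaker A at a chosen energy ratio."""
    n = min(len(wav_a), len(wav_b))
    a, b = wav_a[:n], wav_b[:n]
    gain = np.sqrt((a ** 2).mean() / ((b ** 2).mean() * 10 ** (snr_db / 10) + 1e-12))
    return a + gain * b

def sequential_augment(wav_a: np.ndarray, wav_b: np.ndarray) -> np.ndarray:
    """Speaker change: speaker A followed immediately by speaker B in one segment."""
    return np.concatenate([wav_a, wav_b])

# Example with random noise standing in for two speakers' audio (16 kHz, 2 s each).
rng = np.random.default_rng(0)
spk_a, spk_b = rng.normal(size=32000), rng.normal(size=32000)
print(overlap_augment(spk_a, spk_b).shape, sequential_augment(spk_a, spk_b).shape)
```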
Second, a massive amount of data would be&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2210.10985v2-abstract-full').style.display = 'inline'; document.getElementById('2210.10985v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2210.10985v2-abstract-full" style="display: none;"> The objective of this work is to develop a speaker recognition model to be used in diverse scenarios. We hypothesise that two components should be adequately configured to build such a model. First, adequate architecture would be required. We explore several recent state-of-the-art models, including ECAPA-TDNN and MFA-Conformer, as well as other baselines. Second, a massive amount of data would be required. We investigate several new training data configurations combining a few existing datasets. The most extensive configuration includes over 87k speakers&#39; 10.22k hours of speech. Four evaluation protocols are adopted to measure how the trained model performs in diverse scenarios. Through experiments, we find that MFA-Conformer with the least inductive bias generalises the best. We also show that training with proposed large data configurations gives better performance. A boost in generalisation is observed, where the average performance on four evaluation protocols improves by more than 20%. In addition, we also demonstrate that these models&#39; performances can improve even further when increasing capacity. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2210.10985v2-abstract-full').style.display = 'none'; document.getElementById('2210.10985v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 October, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 19 October, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2022. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5pages, 5 tables, submitted to ICASSP</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2206.04383">arXiv:2206.04383</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2206.04383">pdf</a>, <a href="https://arxiv.org/format/2206.04383">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Medical Physics">physics.med-ph</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1007/978-3-031-16446-0_37">10.1007/978-3-031-16446-0_37 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Only-Train-Once MR Fingerprinting for Magnetization Transfer Contrast Quantification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Kang%2C+B">Beomgu Kang</a>, <a href="/search/eess?searchtype=author&amp;query=Heo%2C+H">Hye-Young Heo</a>, <a href="/search/eess?searchtype=author&amp;query=Park%2C+H">HyunWook Park</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2206.04383v1-abstract-short" style="display: inline;"> Magnetization transfer contrast magnetic resonance fingerprinting (MTC-MRF) is a novel quantitative imaging technique that simultaneously measures several tissue parameters of semisolid macromolecule and free bulk water. In this study, we propose an Only-Train-Once MR fingerprinting (OTOM) framework that estimates the free bulk water and MTC tissue parameters from MR fingerprints regardless of MRF&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2206.04383v1-abstract-full').style.display = 'inline'; document.getElementById('2206.04383v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2206.04383v1-abstract-full" style="display: none;"> Magnetization transfer contrast magnetic resonance fingerprinting (MTC-MRF) is a novel quantitative imaging technique that simultaneously measures several tissue parameters of semisolid macromolecule and free bulk water. In this study, we propose an Only-Train-Once MR fingerprinting (OTOM) framework that estimates the free bulk water and MTC tissue parameters from MR fingerprints regardless of MRF schedule, thereby avoiding time-consuming process such as generation of training dataset and network training according to each MRF schedule. A recurrent neural network is designed to cope with two types of variants of MRF schedules: 1) various lengths and 2) various patterns. Experiments on digital phantoms and in vivo data demonstrate that our approach can achieve accurate quantification for the water and MTC parameters with multiple MRF schedules. Moreover, the proposed method is in excellent agreement with the conventional deep learning and fitting methods. 
The flexible OTOM framework could be an efficient tissue quantification tool for various MRF protocols. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2206.04383v1-abstract-full').style.display = 'none'; document.getElementById('2206.04383v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 June, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at 25th International Conference on Medical Image Computing and Computer Assisted Intervention (MICCAI&#39;22)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2204.09976">arXiv:2204.09976</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2204.09976">pdf</a>, <a href="https://arxiv.org/format/2204.09976">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Baseline Systems for the First Spoofing-Aware Speaker Verification Challenge: Score and Embedding Fusion </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Shim%2C+H">Hye-jin Shim</a>, <a href="/search/eess?searchtype=author&amp;query=Tak%2C+H">Hemlata Tak</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+X">Xuechen Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Heo%2C+H">Hee-Soo Heo</a>, <a href="/search/eess?searchtype=author&amp;query=Jung%2C+J">Jee-weon Jung</a>, <a href="/search/eess?searchtype=author&amp;query=Chung%2C+J+S">Joon Son Chung</a>, <a href="/search/eess?searchtype=author&amp;query=Chung%2C+S">Soo-Whan Chung</a>, <a href="/search/eess?searchtype=author&amp;query=Yu%2C+H">Ha-Jin Yu</a>, <a href="/search/eess?searchtype=author&amp;query=Lee%2C+B">Bong-Jin Lee</a>, <a href="/search/eess?searchtype=author&amp;query=Todisco%2C+M">Massimiliano Todisco</a>, <a href="/search/eess?searchtype=author&amp;query=Delgado%2C+H">H茅ctor Delgado</a>, <a href="/search/eess?searchtype=author&amp;query=Lee%2C+K+A">Kong Aik Lee</a>, <a href="/search/eess?searchtype=author&amp;query=Sahidullah%2C+M">Md Sahidullah</a>, <a href="/search/eess?searchtype=author&amp;query=Kinnunen%2C+T">Tomi Kinnunen</a>, <a href="/search/eess?searchtype=author&amp;query=Evans%2C+N">Nicholas Evans</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2204.09976v1-abstract-short" style="display: inline;"> Deep learning has brought impressive progress in the study of both automatic speaker verification (ASV) and spoofing countermeasures (CM). Although solutions are mutually dependent, they have typically evolved as standalone sub-systems whereby CM solutions are usually designed for a fixed ASV system. 
The work reported in this paper aims to gauge the improvements in reliability that can be gained f&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2204.09976v1-abstract-full').style.display = 'inline'; document.getElementById('2204.09976v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2204.09976v1-abstract-full" style="display: none;"> Deep learning has brought impressive progress in the study of both automatic speaker verification (ASV) and spoofing countermeasures (CM). Although solutions are mutually dependent, they have typically evolved as standalone sub-systems whereby CM solutions are usually designed for a fixed ASV system. The work reported in this paper aims to gauge the improvements in reliability that can be gained from their closer integration. Results derived using the popular ASVspoof2019 dataset indicate that the equal error rate (EER) of a state-of-the-art ASV system degrades from 1.63% to 23.83% when the evaluation protocol is extended with spoofed trials. However, even the straightforward integration of ASV and CM systems in the form of score-sum and deep neural network-based fusion strategies reduces the EER to 1.71% and 6.37%, respectively. The new Spoofing-Aware Speaker Verification (SASV) challenge has been formed to encourage greater attention to the integration of ASV and CM systems as well as to provide a means to benchmark different solutions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2204.09976v1-abstract-full').style.display = 'none'; document.getElementById('2204.09976v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 April, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2022. 
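<p class="is-size-7 has-text-grey-dark"> A minimal sketch of the score-sum fusion idea described in the abstract above, assuming cosine-scored speaker embeddings and min-max normalised score streams; it illustrates the strategy only and is not the released SASV baseline code. </p> <pre><code>
import numpy as np

def cosine(a, b):
    # similarity between an enrolment and a test speaker embedding
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

def minmax(scores):
    # map a score list to [0, 1] so ASV and CM scores share a common range
    s = np.asarray(scores, dtype=float)
    return (s - s.min()) / (s.max() - s.min() + 1e-12)

def score_sum_fusion(asv_scores, cm_scores):
    # a higher fused score suggests a bona fide target trial
    return minmax(asv_scores) + minmax(cm_scores)

# toy trials: ASV cosine scores plus CM bona-fide scores (values are made up)
rng = np.random.default_rng(0)
enrol, test = rng.normal(size=192), rng.normal(size=192)
asv = [cosine(enrol, test), 0.71, -0.20]
cm = [0.90, 0.12, 0.95]
print(score_sum_fusion(asv, cm))
</code></pre>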
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages, accepted by Odyssey 2022</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2203.14732">arXiv:2203.14732</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2203.14732">pdf</a>, <a href="https://arxiv.org/format/2203.14732">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> SASV 2022: The First Spoofing-Aware Speaker Verification Challenge </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Jung%2C+J">Jee-weon Jung</a>, <a href="/search/eess?searchtype=author&amp;query=Tak%2C+H">Hemlata Tak</a>, <a href="/search/eess?searchtype=author&amp;query=Shim%2C+H">Hye-jin Shim</a>, <a href="/search/eess?searchtype=author&amp;query=Heo%2C+H">Hee-Soo Heo</a>, <a href="/search/eess?searchtype=author&amp;query=Lee%2C+B">Bong-Jin Lee</a>, <a href="/search/eess?searchtype=author&amp;query=Chung%2C+S">Soo-Whan Chung</a>, <a href="/search/eess?searchtype=author&amp;query=Yu%2C+H">Ha-Jin Yu</a>, <a href="/search/eess?searchtype=author&amp;query=Evans%2C+N">Nicholas Evans</a>, <a href="/search/eess?searchtype=author&amp;query=Kinnunen%2C+T">Tomi Kinnunen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2203.14732v1-abstract-short" style="display: inline;"> The first spoofing-aware speaker verification (SASV) challenge aims to integrate research efforts in speaker verification and anti-spoofing. We extend the speaker verification scenario by introducing spoofed trials to the usual set of target and impostor trials. In contrast to the established ASVspoof challenge where the focus is upon separate, independently optimised spoofing detection and speake&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2203.14732v1-abstract-full').style.display = 'inline'; document.getElementById('2203.14732v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2203.14732v1-abstract-full" style="display: none;"> The first spoofing-aware speaker verification (SASV) challenge aims to integrate research efforts in speaker verification and anti-spoofing. We extend the speaker verification scenario by introducing spoofed trials to the usual set of target and impostor trials. In contrast to the established ASVspoof challenge where the focus is upon separate, independently optimised spoofing detection and speaker verification sub-systems, SASV targets the development of integrated and jointly optimised solutions. Pre-trained spoofing detection and speaker verification models are provided as open source and are used in two baseline SASV solutions. Both models and baselines are freely available to participants and can be used to develop back-end fusion approaches or end-to-end solutions. Using the provided common evaluation protocol, 23 teams submitted SASV solutions. 
When assessed with target, bona fide non-target and spoofed non-target trials, the top-performing system reduces the equal error rate of a conventional speaker verification system from 23.83% to 0.13%. SASV challenge results are a testament to the reliability of today&#39;s state-of-the-art approaches to spoofing detection and speaker verification. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2203.14732v1-abstract-full').style.display = 'none'; document.getElementById('2203.14732v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 March, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages, 2 figures, 2 tables, submitted to Interspeech 2022 as a conference paper</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2203.14525">arXiv:2203.14525</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2203.14525">pdf</a>, <a href="https://arxiv.org/format/2203.14525">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Curriculum learning for self-supervised speaker verification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Heo%2C+H">Hee-Soo Heo</a>, <a href="/search/eess?searchtype=author&amp;query=Jung%2C+J">Jee-weon Jung</a>, <a href="/search/eess?searchtype=author&amp;query=Kang%2C+J">Jingu Kang</a>, <a href="/search/eess?searchtype=author&amp;query=Kwon%2C+Y">Youngki Kwon</a>, <a href="/search/eess?searchtype=author&amp;query=Kim%2C+Y+J">You Jin Kim</a>, <a href="/search/eess?searchtype=author&amp;query=Lee%2C+B">Bong-Jin Lee</a>, <a href="/search/eess?searchtype=author&amp;query=Chung%2C+J+S">Joon Son Chung</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2203.14525v4-abstract-short" style="display: inline;"> The goal of this paper is to train effective self-supervised speaker representations without identity labels. We propose two curriculum learning strategies within a self-supervised learning framework. The first strategy aims to gradually increase the number of speakers in the training phase by enlarging the used portion of the train dataset. The second strategy applies various data augmentations t&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2203.14525v4-abstract-full').style.display = 'inline'; document.getElementById('2203.14525v4-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2203.14525v4-abstract-full" style="display: none;"> The goal of this paper is to train effective self-supervised speaker representations without identity labels. We propose two curriculum learning strategies within a self-supervised learning framework. 
The first strategy aims to gradually increase the number of speakers in the training phase by enlarging the used portion of the train dataset. The second strategy applies various data augmentations to more utterances within a mini-batch as the training proceeds. A range of experiments conducted using the DINO self-supervised framework on the VoxCeleb1 evaluation protocol demonstrates the effectiveness of our proposed curriculum learning strategies. We report a competitive equal error rate of 4.47% with a single-phase training, and we also demonstrate that the performance further improves to 1.84% by fine-tuning on a small labelled dataset. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2203.14525v4-abstract-full').style.display = 'none'; document.getElementById('2203.14525v4-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 28 March, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">INTERSPEECH 2023. 5 pages, 3 figures, 4 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2203.08488">arXiv:2203.08488</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2203.08488">pdf</a>, <a href="https://arxiv.org/format/2203.08488">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Pushing the limits of raw waveform speaker recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Jung%2C+J">Jee-weon Jung</a>, <a href="/search/eess?searchtype=author&amp;query=Kim%2C+Y+J">You Jin Kim</a>, <a href="/search/eess?searchtype=author&amp;query=Heo%2C+H">Hee-Soo Heo</a>, <a href="/search/eess?searchtype=author&amp;query=Lee%2C+B">Bong-Jin Lee</a>, <a href="/search/eess?searchtype=author&amp;query=Kwon%2C+Y">Youngki Kwon</a>, <a href="/search/eess?searchtype=author&amp;query=Chung%2C+J+S">Joon Son Chung</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2203.08488v2-abstract-short" style="display: inline;"> In recent years, speaker recognition systems based on raw waveform inputs have received increasing attention. However, the performance of such systems is typically inferior to the state-of-the-art handcrafted feature-based counterparts, which demonstrate equal error rates under 1% on the popular VoxCeleb1 test set. 
This paper proposes a novel speaker recognition model based on raw waveform inputs&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2203.08488v2-abstract-full').style.display = 'inline'; document.getElementById('2203.08488v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2203.08488v2-abstract-full" style="display: none;"> In recent years, speaker recognition systems based on raw waveform inputs have received increasing attention. However, the performance of such systems is typically inferior to the state-of-the-art handcrafted feature-based counterparts, which demonstrate equal error rates under 1% on the popular VoxCeleb1 test set. This paper proposes a novel speaker recognition model based on raw waveform inputs. The model incorporates recent advances in machine learning and speaker verification, including the Res2Net backbone module and multi-layer feature aggregation. Our best model achieves an equal error rate of 0.89%, which is competitive with the state-of-the-art models based on handcrafted features, and outperforms the best model based on raw waveform inputs by a large margin. We also explore the application of the proposed model in the context of a self-supervised learning framework. Our self-supervised model outperforms existing single-phase works in this line of research. Finally, we show that self-supervised pre-training is effective for the semi-supervised scenario where we only have a small set of labelled training data, along with a larger set of unlabelled examples. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2203.08488v2-abstract-full').style.display = 'none'; document.getElementById('2203.08488v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 March, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 16 March, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">submitted to INTERSPEECH 2022 as a conference paper. 
5 pages, 2 figures, 5 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2201.10283">arXiv:2201.10283</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2201.10283">pdf</a>, <a href="https://arxiv.org/ps/2201.10283">ps</a>, <a href="https://arxiv.org/format/2201.10283">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> SASV Challenge 2022: A Spoofing Aware Speaker Verification Challenge Evaluation Plan </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Jung%2C+J">Jee-weon Jung</a>, <a href="/search/eess?searchtype=author&amp;query=Tak%2C+H">Hemlata Tak</a>, <a href="/search/eess?searchtype=author&amp;query=Shim%2C+H">Hye-jin Shim</a>, <a href="/search/eess?searchtype=author&amp;query=Heo%2C+H">Hee-Soo Heo</a>, <a href="/search/eess?searchtype=author&amp;query=Lee%2C+B">Bong-Jin Lee</a>, <a href="/search/eess?searchtype=author&amp;query=Chung%2C+S">Soo-Whan Chung</a>, <a href="/search/eess?searchtype=author&amp;query=Kang%2C+H">Hong-Goo Kang</a>, <a href="/search/eess?searchtype=author&amp;query=Yu%2C+H">Ha-Jin Yu</a>, <a href="/search/eess?searchtype=author&amp;query=Evans%2C+N">Nicholas Evans</a>, <a href="/search/eess?searchtype=author&amp;query=Kinnunen%2C+T">Tomi Kinnunen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2201.10283v2-abstract-short" style="display: inline;"> ASV (automatic speaker verification) systems are intrinsically required to reject both non-target (e.g., voice uttered by a different speaker) and spoofed (e.g., synthesised or converted) inputs. However, there is little consideration for how ASV systems themselves should be adapted when they are expected to encounter spoofing attacks, nor when they operate in tandem with CMs (spoofing countermeasur&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2201.10283v2-abstract-full').style.display = 'inline'; document.getElementById('2201.10283v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2201.10283v2-abstract-full" style="display: none;"> ASV (automatic speaker verification) systems are intrinsically required to reject both non-target (e.g., voice uttered by a different speaker) and spoofed (e.g., synthesised or converted) inputs. However, there is little consideration for how ASV systems themselves should be adapted when they are expected to encounter spoofing attacks, nor when they operate in tandem with CMs (spoofing countermeasures), much less how both systems should be jointly optimised. The goal of the first SASV (spoofing-aware speaker verification) challenge, a special session at ISCA INTERSPEECH 2022, is to promote the development of integrated systems that can perform ASV and CM simultaneously. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2201.10283v2-abstract-full').style.display = 'none'; document.getElementById('2201.10283v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 March, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 25 January, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Evaluation plan of the SASV Challenge 2022. See this webpage for more information: https://sasv-challenge.github.io</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2110.14513">arXiv:2110.14513</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2110.14513">pdf</a>, <a href="https://arxiv.org/format/2110.14513">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Neural Analysis and Synthesis: Reconstructing Speech from Self-Supervised Representations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Choi%2C+H">Hyeong-Seok Choi</a>, <a href="/search/eess?searchtype=author&amp;query=Lee%2C+J">Juheon Lee</a>, <a href="/search/eess?searchtype=author&amp;query=Kim%2C+W">Wansoo Kim</a>, <a href="/search/eess?searchtype=author&amp;query=Lee%2C+J+H">Jie Hwan Lee</a>, <a href="/search/eess?searchtype=author&amp;query=Heo%2C+H">Hoon Heo</a>, <a href="/search/eess?searchtype=author&amp;query=Lee%2C+K">Kyogu Lee</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2110.14513v2-abstract-short" style="display: inline;"> We present a neural analysis and synthesis (NANSY) framework that can manipulate voice, pitch, and speed of an arbitrary speech signal. Most of the previous works have focused on using information bottleneck to disentangle analysis features for controllable synthesis, which usually results in poor reconstruction quality. We address this issue by proposing a novel training strategy based on informa&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2110.14513v2-abstract-full').style.display = 'inline'; document.getElementById('2110.14513v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2110.14513v2-abstract-full" style="display: none;"> We present a neural analysis and synthesis (NANSY) framework that can manipulate voice, pitch, and speed of an arbitrary speech signal. Most of the previous works have focused on using information bottleneck to disentangle analysis features for controllable synthesis, which usually results in poor reconstruction quality. 
We address this issue by proposing a novel training strategy based on information perturbation. The idea is to perturb information in the original input signal (e.g., formant, pitch, and frequency response), thereby letting synthesis networks selectively take essential attributes to reconstruct the input signal. Because NANSY does not need any bottleneck structures, it enjoys both high reconstruction quality and controllability. Furthermore, NANSY does not require any labels associated with speech data such as text and speaker information, but rather uses a new set of analysis features, i.e., wav2vec feature and newly proposed pitch feature, Yingram, which allows for fully self-supervised training. Taking advantage of fully self-supervised training, NANSY can be easily extended to a multilingual setting by simply training it with a multilingual dataset. The experiments show that NANSY can achieve significant improvement in performance in several applications such as zero-shot voice conversion, pitch shift, and time-scale modification. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2110.14513v2-abstract-full').style.display = 'none'; document.getElementById('2110.14513v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 October, 2021; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 27 October, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Neural Information Processing Systems (NeurIPS) 2021</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2110.03361">arXiv:2110.03361</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2110.03361">pdf</a>, <a href="https://arxiv.org/format/2110.03361">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Multi-scale speaker embedding-based graph attention networks for speaker diarisation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Kwon%2C+Y">Youngki Kwon</a>, <a href="/search/eess?searchtype=author&amp;query=Heo%2C+H">Hee-Soo Heo</a>, <a href="/search/eess?searchtype=author&amp;query=Jung%2C+J">Jee-weon Jung</a>, <a href="/search/eess?searchtype=author&amp;query=Kim%2C+Y+J">You Jin Kim</a>, <a href="/search/eess?searchtype=author&amp;query=Lee%2C+B">Bong-Jin Lee</a>, <a href="/search/eess?searchtype=author&amp;query=Chung%2C+J+S">Joon Son Chung</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2110.03361v1-abstract-short" style="display: inline;"> The objective of this work is effective speaker diarisation using multi-scale speaker embeddings. 
Typically, there is a trade-off between the ability to recognise short speaker segments and the discriminative power of the embedding, according to the segment length used for embedding extraction. To this end, recent works have proposed the use of multi-scale embeddings where segments with varying le&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2110.03361v1-abstract-full').style.display = 'inline'; document.getElementById('2110.03361v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2110.03361v1-abstract-full" style="display: none;"> The objective of this work is effective speaker diarisation using multi-scale speaker embeddings. Typically, there is a trade-off between the ability to recognise short speaker segments and the discriminative power of the embedding, according to the segment length used for embedding extraction. To this end, recent works have proposed the use of multi-scale embeddings where segments with varying lengths are used. However, the scores are combined using a weighted summation scheme where the weights are fixed after the training phase, whereas the importance of segment lengths can differ within a single session. To address this issue, we present three key contributions in this paper: (1) we propose graph attention networks for multi-scale speaker diarisation; (2) we design scale indicators to utilise scale information of each embedding; (3) we adapt the attention-based aggregation to utilise a pre-computed affinity matrix from multi-scale embeddings. We demonstrate the effectiveness of our method in various datasets where the speaker confusion, which constitutes the primary metric, drops by over 10% on average relative to the baseline. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2110.03361v1-abstract-full').style.display = 'none'; document.getElementById('2110.03361v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 October, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2021. 
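<p class="is-size-7 has-text-grey-dark"> A minimal sketch of combining affinity scores obtained from embeddings extracted at several segment scales, assuming pre-computed per-scale affinity matrices and illustrative scale weights; the graph attention network proposed in the paper above is not reproduced here. </p> <pre><code>
import numpy as np

def softmax(x):
    e = np.exp(x - np.max(x))
    return e / e.sum()

def combine_affinities(affinities, scale_logits):
    # affinities: list of (N, N) matrices, one per segment scale
    # scale_logits: unnormalised importance per scale; fixed after training in
    # prior work, estimated per session by the attention-based approach above
    w = softmax(np.asarray(scale_logits, dtype=float))
    return sum(wi * a for wi, a in zip(w, affinities))

rng = np.random.default_rng(0)
scales = [rng.random((6, 6)) for _ in range(3)]  # e.g. 1.5 s, 1.0 s and 0.5 s windows
fused = combine_affinities(scales, scale_logits=[0.2, 0.5, 0.3])
print(fused.shape)  # (6, 6) affinity matrix that is then clustered into speakers
</code></pre>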
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages, 2 figures, submitted to ICASSP as a conference paper</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2110.01200">arXiv:2110.01200</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2110.01200">pdf</a>, <a href="https://arxiv.org/format/2110.01200">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> AASIST: Audio Anti-Spoofing using Integrated Spectro-Temporal Graph Attention Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Jung%2C+J">Jee-weon Jung</a>, <a href="/search/eess?searchtype=author&amp;query=Heo%2C+H">Hee-Soo Heo</a>, <a href="/search/eess?searchtype=author&amp;query=Tak%2C+H">Hemlata Tak</a>, <a href="/search/eess?searchtype=author&amp;query=Shim%2C+H">Hye-jin Shim</a>, <a href="/search/eess?searchtype=author&amp;query=Chung%2C+J+S">Joon Son Chung</a>, <a href="/search/eess?searchtype=author&amp;query=Lee%2C+B">Bong-Jin Lee</a>, <a href="/search/eess?searchtype=author&amp;query=Yu%2C+H">Ha-Jin Yu</a>, <a href="/search/eess?searchtype=author&amp;query=Evans%2C+N">Nicholas Evans</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2110.01200v1-abstract-short" style="display: inline;"> Artefacts that differentiate spoofed from bona-fide utterances can reside in spectral or temporal domains. Their reliable detection usually depends upon computationally demanding ensemble systems where each subsystem is tuned to some specific artefacts. We seek to develop an efficient, single system that can detect a broad range of different spoofing attacks without score-level ensembles. We propo&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2110.01200v1-abstract-full').style.display = 'inline'; document.getElementById('2110.01200v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2110.01200v1-abstract-full" style="display: none;"> Artefacts that differentiate spoofed from bona-fide utterances can reside in spectral or temporal domains. Their reliable detection usually depends upon computationally demanding ensemble systems where each subsystem is tuned to some specific artefacts. We seek to develop an efficient, single system that can detect a broad range of different spoofing attacks without score-level ensembles. We propose a novel heterogeneous stacking graph attention layer which models artefacts spanning heterogeneous temporal and spectral domains with a heterogeneous attention mechanism and a stack node. With a new max graph operation that involves a competitive mechanism and an extended readout scheme, our approach, named AASIST, outperforms the current state-of-the-art by 20% relative. 
Even a lightweight variant, AASIST-L, with only 85K parameters, outperforms all competing systems. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2110.01200v1-abstract-full').style.display = 'none'; document.getElementById('2110.01200v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 October, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages, 1 figure, 3 tables, submitted to ICASSP2022</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2108.07640">arXiv:2108.07640</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2108.07640">pdf</a>, <a href="https://arxiv.org/format/2108.07640">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Look Who&#39;s Talking: Active Speaker Detection in the Wild </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Kim%2C+Y+J">You Jin Kim</a>, <a href="/search/eess?searchtype=author&amp;query=Heo%2C+H">Hee-Soo Heo</a>, <a href="/search/eess?searchtype=author&amp;query=Choe%2C+S">Soyeon Choe</a>, <a href="/search/eess?searchtype=author&amp;query=Chung%2C+S">Soo-Whan Chung</a>, <a href="/search/eess?searchtype=author&amp;query=Kwon%2C+Y">Yoohwan Kwon</a>, <a href="/search/eess?searchtype=author&amp;query=Lee%2C+B">Bong-Jin Lee</a>, <a href="/search/eess?searchtype=author&amp;query=Kwon%2C+Y">Youngki Kwon</a>, <a href="/search/eess?searchtype=author&amp;query=Chung%2C+J+S">Joon Son Chung</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2108.07640v1-abstract-short" style="display: inline;"> In this work, we present a novel audio-visual dataset for active speaker detection in the wild. A speaker is considered active when his or her face is visible and the voice is audible simultaneously. Although active speaker detection is a crucial pre-processing step for many audio-visual tasks, there is no existing dataset of natural human speech to evaluate the performance of active speaker detec&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2108.07640v1-abstract-full').style.display = 'inline'; document.getElementById('2108.07640v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2108.07640v1-abstract-full" style="display: none;"> In this work, we present a novel audio-visual dataset for active speaker detection in the wild. 
A speaker is considered active when his or her face is visible and the voice is audible simultaneously. Although active speaker detection is a crucial pre-processing step for many audio-visual tasks, there is no existing dataset of natural human speech to evaluate the performance of active speaker detection. We therefore curate the Active Speakers in the Wild (ASW) dataset which contains videos and co-occurring speech segments with dense speech activity labels. Videos and timestamps of audible segments are parsed and adopted from VoxConverse, an existing speaker diarisation dataset that consists of videos in the wild. Face tracks are extracted from the videos and active segments are annotated based on the timestamps of VoxConverse in a semi-automatic way. Two reference systems, a self-supervised system and a fully supervised one, are evaluated on the dataset to provide the baseline performances of ASW. Cross-domain evaluation is conducted in order to show the negative effect of dubbed videos in the training data. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2108.07640v1-abstract-full').style.display = 'none'; document.getElementById('2108.07640v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 August, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">To appear in Interspeech 2021. Data will be available from https://github.com/clovaai/lookwhostalking</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2104.02879">arXiv:2104.02879</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2104.02879">pdf</a>, <a href="https://arxiv.org/format/2104.02879">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Adapting Speaker Embeddings for Speaker Diarisation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Kwon%2C+Y">Youngki Kwon</a>, <a href="/search/eess?searchtype=author&amp;query=Jung%2C+J">Jee-weon Jung</a>, <a href="/search/eess?searchtype=author&amp;query=Heo%2C+H">Hee-Soo Heo</a>, <a href="/search/eess?searchtype=author&amp;query=Kim%2C+Y+J">You Jin Kim</a>, <a href="/search/eess?searchtype=author&amp;query=Lee%2C+B">Bong-Jin Lee</a>, <a href="/search/eess?searchtype=author&amp;query=Chung%2C+J+S">Joon Son Chung</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2104.02879v1-abstract-short" style="display: inline;"> The goal of this paper is to adapt speaker embeddings for solving the problem of speaker diarisation. The quality of speaker embeddings is paramount to the performance of speaker diarisation systems. 
Despite this, prior works in the field have directly used embeddings designed only to be effective on the speaker verification task. In this paper, we propose three techniques that can be used to bett&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2104.02879v1-abstract-full').style.display = 'inline'; document.getElementById('2104.02879v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2104.02879v1-abstract-full" style="display: none;"> The goal of this paper is to adapt speaker embeddings for solving the problem of speaker diarisation. The quality of speaker embeddings is paramount to the performance of speaker diarisation systems. Despite this, prior works in the field have directly used embeddings designed only to be effective on the speaker verification task. In this paper, we propose three techniques that can be used to better adapt the speaker embeddings for diarisation: dimensionality reduction, attention-based embedding aggregation, and non-speech clustering. A wide range of experiments is performed on various challenging datasets. The results demonstrate that all three techniques contribute positively to the performance of the diarisation system achieving an average relative improvement of 25.07% in terms of diarisation error rate over the baseline. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2104.02879v1-abstract-full').style.display = 'none'; document.getElementById('2104.02879v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 April, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2021. 
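<p class="is-size-7 has-text-grey-dark"> A minimal sketch of the first adaptation mentioned above, dimensionality reduction of verification embeddings before clustering, using PCA and agglomerative clustering as stand-ins; the attention-based aggregation and non-speech clustering steps are not shown. </p> <pre><code>
import numpy as np
from sklearn.decomposition import PCA
from sklearn.cluster import AgglomerativeClustering

def diarise(segment_embeddings, n_speakers, n_dims=20):
    # reduce the high-dimensional verification embeddings, then group the
    # segments of one session into speaker clusters
    reduced = PCA(n_components=n_dims).fit_transform(segment_embeddings)
    return AgglomerativeClustering(n_clusters=n_speakers).fit_predict(reduced)

# toy session: 40 segments with 256-dim embeddings from two synthetic speakers
rng = np.random.default_rng(0)
emb = np.vstack([rng.normal(0.0, 1.0, (20, 256)), rng.normal(3.0, 1.0, (20, 256))])
print(diarise(emb, n_speakers=2))
</code></pre>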
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages, 2 figures, 3 tables, submitted to Interspeech as a conference paper</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2104.02878">arXiv:2104.02878</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2104.02878">pdf</a>, <a href="https://arxiv.org/format/2104.02878">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Three-class Overlapped Speech Detection using a Convolutional Recurrent Neural Network </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Jung%2C+J">Jee-weon Jung</a>, <a href="/search/eess?searchtype=author&amp;query=Heo%2C+H">Hee-Soo Heo</a>, <a href="/search/eess?searchtype=author&amp;query=Kwon%2C+Y">Youngki Kwon</a>, <a href="/search/eess?searchtype=author&amp;query=Chung%2C+J+S">Joon Son Chung</a>, <a href="/search/eess?searchtype=author&amp;query=Lee%2C+B">Bong-Jin Lee</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2104.02878v1-abstract-short" style="display: inline;"> In this work, we propose an overlapped speech detection system trained as a three-class classifier. Unlike conventional systems that perform binary classification as to whether or not a frame contains overlapped speech, the proposed approach classifies into three classes: non-speech, single speaker speech, and overlapped speech. By training a network with the more detailed label definition, the mo&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2104.02878v1-abstract-full').style.display = 'inline'; document.getElementById('2104.02878v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2104.02878v1-abstract-full" style="display: none;"> In this work, we propose an overlapped speech detection system trained as a three-class classifier. Unlike conventional systems that perform binary classification as to whether or not a frame contains overlapped speech, the proposed approach classifies into three classes: non-speech, single speaker speech, and overlapped speech. By training a network with the more detailed label definition, the model can learn a better notion on deciding the number of speakers included in a given frame. A convolutional recurrent neural network architecture is explored to benefit from both convolutional layer&#39;s capability to model local patterns and recurrent layer&#39;s ability to model sequential information. The proposed overlapped speech detection model establishes a state-of-the-art performance with a precision of 0.6648 and a recall of 0.3222 on the DIHARD II evaluation set, showing a 20% increase in recall along with higher precision. 
In addition, we also introduce a simple approach to utilize the proposed overlapped speech detection model for speaker diarization which ranked third place in the Track 1 of the DIHARD III challenge. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2104.02878v1-abstract-full').style.display = 'none'; document.getElementById('2104.02878v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 April, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages, 2 figures, 4 tables, submitted to Interspeech as a conference paper</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2102.03207">arXiv:2102.03207</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2102.03207">pdf</a>, <a href="https://arxiv.org/format/2102.03207">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Real-time Denoising and Dereverberation with Tiny Recurrent U-Net </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Choi%2C+H">Hyeong-Seok Choi</a>, <a href="/search/eess?searchtype=author&amp;query=Park%2C+S">Sungjin Park</a>, <a href="/search/eess?searchtype=author&amp;query=Lee%2C+J+H">Jie Hwan Lee</a>, <a href="/search/eess?searchtype=author&amp;query=Heo%2C+H">Hoon Heo</a>, <a href="/search/eess?searchtype=author&amp;query=Jeon%2C+D">Dongsuk Jeon</a>, <a href="/search/eess?searchtype=author&amp;query=Lee%2C+K">Kyogu Lee</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2102.03207v3-abstract-short" style="display: inline;"> Modern deep learning-based models have seen outstanding performance improvement with speech enhancement tasks. The number of parameters of state-of-the-art models, however, is often too large to be deployed on devices for real-world applications. To this end, we propose Tiny Recurrent U-Net (TRU-Net), a lightweight online inference model that matches the performance of current state-of-the-art mod&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2102.03207v3-abstract-full').style.display = 'inline'; document.getElementById('2102.03207v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2102.03207v3-abstract-full" style="display: none;"> Modern deep learning-based models have seen outstanding performance improvement with speech enhancement tasks. The number of parameters of state-of-the-art models, however, is often too large to be deployed on devices for real-world applications. 
To this end, we propose Tiny Recurrent U-Net (TRU-Net), a lightweight online inference model that matches the performance of current state-of-the-art models. The size of the quantized version of TRU-Net is 362 kilobytes, which is small enough to be deployed on edge devices. In addition, we combine the small-sized model with a new masking method called phase-aware $β$-sigmoid mask, which enables simultaneous denoising and dereverberation. Results of both objective and subjective evaluations have shown that our model can achieve competitive performance with the current state-of-the-art models on benchmark datasets using fewer parameters by orders of magnitude. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2102.03207v3-abstract-full').style.display = 'none'; document.getElementById('2102.03207v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 June, 2021; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 5 February, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages, 2 figures, 2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). arXiv admin note: text overlap with arXiv:2006.00687</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2011.14885">arXiv:2011.14885</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2011.14885">pdf</a>, <a href="https://arxiv.org/ps/2011.14885">ps</a>, <a href="https://arxiv.org/format/2011.14885">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Look who&#39;s not talking </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Kwon%2C+Y">Youngki Kwon</a>, <a href="/search/eess?searchtype=author&amp;query=Heo%2C+H+S">Hee Soo Heo</a>, <a href="/search/eess?searchtype=author&amp;query=Huh%2C+J">Jaesung Huh</a>, <a href="/search/eess?searchtype=author&amp;query=Lee%2C+B">Bong-Jin Lee</a>, <a href="/search/eess?searchtype=author&amp;query=Chung%2C+J+S">Joon Son Chung</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2011.14885v1-abstract-short" style="display: inline;"> The objective of this work is speaker diarisation of speech recordings &#39;in the wild&#39;. The ability to determine speech segments is a crucial part of diarisation systems, accounting for a large proportion of errors. In this paper, we present a simple but effective solution for speech activity detection based on the speaker embeddings. 
In particular, we discover that the norm of the speaker embedding&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2011.14885v1-abstract-full').style.display = 'inline'; document.getElementById('2011.14885v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2011.14885v1-abstract-full" style="display: none;"> The objective of this work is speaker diarisation of speech recordings &#39;in the wild&#39;. The ability to determine speech segments is a crucial part of diarisation systems, accounting for a large proportion of errors. In this paper, we present a simple but effective solution for speech activity detection based on the speaker embeddings. In particular, we discover that the norm of the speaker embedding is an extremely effective indicator of speech activity. The method does not require an independent model for speech activity detection, therefore allows speaker diarisation to be performed using a unified representation for both speaker modelling and speech activity detection. We perform a number of experiments on in-house and public datasets, in which our method outperforms popular baselines. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2011.14885v1-abstract-full').style.display = 'none'; document.getElementById('2011.14885v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 November, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2020. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">SLT 2021</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2011.02168">arXiv:2011.02168</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2011.02168">pdf</a>, <a href="https://arxiv.org/format/2011.02168">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Learning in your voice: Non-parallel voice conversion based on speaker consistency loss </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Kwon%2C+Y">Yoohwan Kwon</a>, <a href="/search/eess?searchtype=author&amp;query=Chung%2C+S">Soo-Whan Chung</a>, <a href="/search/eess?searchtype=author&amp;query=Heo%2C+H">Hee-Soo Heo</a>, <a href="/search/eess?searchtype=author&amp;query=Kang%2C+H">Hong-Goo Kang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2011.02168v1-abstract-short" style="display: inline;"> In this paper, we propose a novel voice conversion strategy to resolve the mismatch between the training and conversion scenarios when parallel speech corpus is unavailable for training. Based on auto-encoder and disentanglement frameworks, we design the proposed model to extract identity and content representations while reconstructing the input speech signal itself. 
Since we use other speaker&#39;s&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2011.02168v1-abstract-full').style.display = 'inline'; document.getElementById('2011.02168v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2011.02168v1-abstract-full" style="display: none;"> In this paper, we propose a novel voice conversion strategy to resolve the mismatch between the training and conversion scenarios when parallel speech corpus is unavailable for training. Based on auto-encoder and disentanglement frameworks, we design the proposed model to extract identity and content representations while reconstructing the input speech signal itself. Since we use other speaker&#39;s identity information in the training process, the training philosophy is naturally matched with the objective of voice conversion process. In addition, we effectively design the disentanglement framework to reliably preserve linguistic information and to enhance the quality of converted speech signals. The superiority of the proposed method is shown in subjective listening tests as well as objective measures. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2011.02168v1-abstract-full').style.display = 'none'; document.getElementById('2011.02168v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 November, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2020. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICASSP 2021 submitted</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2010.15809">arXiv:2010.15809</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2010.15809">pdf</a>, <a href="https://arxiv.org/format/2010.15809">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> The ins and outs of speaker recognition: lessons from VoxSRC 2020 </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Kwon%2C+Y">Yoohwan Kwon</a>, <a href="/search/eess?searchtype=author&amp;query=Heo%2C+H">Hee-Soo Heo</a>, <a href="/search/eess?searchtype=author&amp;query=Lee%2C+B">Bong-Jin Lee</a>, <a href="/search/eess?searchtype=author&amp;query=Chung%2C+J+S">Joon Son Chung</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2010.15809v1-abstract-short" style="display: inline;"> The VoxCeleb Speaker Recognition Challenge (VoxSRC) at Interspeech 2020 offers a challenging evaluation for speaker recognition systems, which includes celebrities playing different parts in movies. The goal of this work is robust speaker recognition of utterances recorded in these challenging environments. 
We utilise variants of the popular ResNet architecture for speaker recognition and perform&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2010.15809v1-abstract-full').style.display = 'inline'; document.getElementById('2010.15809v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2010.15809v1-abstract-full" style="display: none;"> The VoxCeleb Speaker Recognition Challenge (VoxSRC) at Interspeech 2020 offers a challenging evaluation for speaker recognition systems, which includes celebrities playing different parts in movies. The goal of this work is robust speaker recognition of utterances recorded in these challenging environments. We utilise variants of the popular ResNet architecture for speaker recognition and perform extensive experiments using a range of loss functions and training parameters. To this end, we optimise an efficient training framework that allows powerful models to be trained with limited time and resources. Our trained models demonstrate improvements over most existing works with lighter models and a simple pipeline. The paper shares the lessons learned from our participation in the challenge. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2010.15809v1-abstract-full').style.display = 'none'; document.getElementById('2010.15809v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 October, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2020. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2010.11543">arXiv:2010.11543</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2010.11543">pdf</a>, <a href="https://arxiv.org/format/2010.11543">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Graph Attention Networks for Speaker Verification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Jung%2C+J">Jee-weon Jung</a>, <a href="/search/eess?searchtype=author&amp;query=Heo%2C+H">Hee-Soo Heo</a>, <a href="/search/eess?searchtype=author&amp;query=Yu%2C+H">Ha-Jin Yu</a>, <a href="/search/eess?searchtype=author&amp;query=Chung%2C+J+S">Joon Son Chung</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2010.11543v2-abstract-short" style="display: inline;"> This work presents a novel back-end framework for speaker verification using graph attention networks. Segment-wise speaker embeddings extracted from multiple crops within an utterance are interpreted as node representations of a graph. The proposed framework inputs segment-wise speaker embeddings from an enrollment and a test utterance and directly outputs a similarity score. 
We first construct a&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2010.11543v2-abstract-full').style.display = 'inline'; document.getElementById('2010.11543v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2010.11543v2-abstract-full" style="display: none;"> This work presents a novel back-end framework for speaker verification using graph attention networks. Segment-wise speaker embeddings extracted from multiple crops within an utterance are interpreted as node representations of a graph. The proposed framework inputs segment-wise speaker embeddings from an enrollment and a test utterance and directly outputs a similarity score. We first construct a graph using segment-wise speaker embeddings and then input these to graph attention networks. After a few graph attention layers with residual connections, each node is projected into a one-dimensional space using affine transform, followed by a readout operation resulting in a scalar similarity score. To enable successful adaptation for speaker verification, we propose techniques such as separating trainable weights for attention map calculations between segment-wise speaker embeddings from different utterances. The effectiveness of the proposed framework is validated using three different speaker embedding extractors trained with different architectures and objective functions. Experimental results demonstrate consistent improvement over various baseline back-end classifiers, with an average equal error rate improvement of 20% over the cosine similarity back-end without test time augmentation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2010.11543v2-abstract-full').style.display = 'none'; document.getElementById('2010.11543v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 February, 2021; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 22 October, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2020. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages, 1 figure, 2 tables, accepted for presentation at ICASSP 2021 as a conference paper</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2009.14153">arXiv:2009.14153</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2009.14153">pdf</a>, <a href="https://arxiv.org/format/2009.14153">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Clova Baseline System for the VoxCeleb Speaker Recognition Challenge 2020 </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Heo%2C+H+S">Hee Soo Heo</a>, <a href="/search/eess?searchtype=author&amp;query=Lee%2C+B">Bong-Jin Lee</a>, <a href="/search/eess?searchtype=author&amp;query=Huh%2C+J">Jaesung Huh</a>, <a href="/search/eess?searchtype=author&amp;query=Chung%2C+J+S">Joon Son Chung</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2009.14153v1-abstract-short" style="display: inline;"> This report describes our submission to the VoxCeleb Speaker Recognition Challenge (VoxSRC) at Interspeech 2020. We perform a careful analysis of speaker recognition models based on the popular ResNet architecture, and train a number of variants using a range of loss functions. Our results show significant improvements over most existing works without the use of model ensemble or post-processing.&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2009.14153v1-abstract-full').style.display = 'inline'; document.getElementById('2009.14153v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2009.14153v1-abstract-full" style="display: none;"> This report describes our submission to the VoxCeleb Speaker Recognition Challenge (VoxSRC) at Interspeech 2020. We perform a careful analysis of speaker recognition models based on the popular ResNet architecture, and train a number of variants using a range of loss functions. Our results show significant improvements over most existing works without the use of model ensemble or post-processing. We release the training code and pre-trained models as unofficial baselines for this year&#39;s challenge. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2009.14153v1-abstract-full').style.display = 'none'; document.getElementById('2009.14153v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 September, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2020. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2007.12085">arXiv:2007.12085</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2007.12085">pdf</a>, <a href="https://arxiv.org/format/2007.12085">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Augmentation adversarial training for self-supervised speaker recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Huh%2C+J">Jaesung Huh</a>, <a href="/search/eess?searchtype=author&amp;query=Heo%2C+H+S">Hee Soo Heo</a>, <a href="/search/eess?searchtype=author&amp;query=Kang%2C+J">Jingu Kang</a>, <a href="/search/eess?searchtype=author&amp;query=Watanabe%2C+S">Shinji Watanabe</a>, <a href="/search/eess?searchtype=author&amp;query=Chung%2C+J+S">Joon Son Chung</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2007.12085v3-abstract-short" style="display: inline;"> The goal of this work is to train robust speaker recognition models without speaker labels. Recent works on unsupervised speaker representations are based on contrastive learning in which they encourage within-utterance embeddings to be similar and across-utterance embeddings to be dissimilar. However, since the within-utterance segments share the same acoustic characteristics, it is difficult to&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2007.12085v3-abstract-full').style.display = 'inline'; document.getElementById('2007.12085v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2007.12085v3-abstract-full" style="display: none;"> The goal of this work is to train robust speaker recognition models without speaker labels. Recent works on unsupervised speaker representations are based on contrastive learning in which they encourage within-utterance embeddings to be similar and across-utterance embeddings to be dissimilar. However, since the within-utterance segments share the same acoustic characteristics, it is difficult to separate the speaker information from the channel information. To this end, we propose augmentation adversarial training strategy that trains the network to be discriminative for the speaker information, while invariant to the augmentation applied. Since the augmentation simulates the acoustic characteristics, training the network to be invariant to augmentation also encourages the network to be invariant to the channel information in general. Extensive experiments on the VoxCeleb and VOiCES datasets show significant improvements over previous works using self-supervision, and the performance of our self-supervised models far exceed that of humans. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2007.12085v3-abstract-full').style.display = 'none'; document.getElementById('2007.12085v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 October, 2020; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 23 July, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2020. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Workshop on Self-Supervised Learning for Speech and Audio Processing, NeurIPS</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2006.00687">arXiv:2006.00687</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2006.00687">pdf</a>, <a href="https://arxiv.org/format/2006.00687">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Phase-aware Single-stage Speech Denoising and Dereverberation with U-Net </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Choi%2C+H">Hyeong-Seok Choi</a>, <a href="/search/eess?searchtype=author&amp;query=Heo%2C+H">Hoon Heo</a>, <a href="/search/eess?searchtype=author&amp;query=Lee%2C+J+H">Jie Hwan Lee</a>, <a href="/search/eess?searchtype=author&amp;query=Lee%2C+K">Kyogu Lee</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2006.00687v1-abstract-short" style="display: inline;"> In this work, we tackle a denoising and dereverberation problem with a single-stage framework. Although denoising and dereverberation may be considered two separate challenging tasks, and thus, two modules are typically required for each task, we show that a single deep network can be shared to solve the two problems. To this end, we propose a new masking method called phase-aware beta-sigmoid mas&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2006.00687v1-abstract-full').style.display = 'inline'; document.getElementById('2006.00687v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2006.00687v1-abstract-full" style="display: none;"> In this work, we tackle a denoising and dereverberation problem with a single-stage framework. Although denoising and dereverberation may be considered two separate challenging tasks, and thus, two modules are typically required for each task, we show that a single deep network can be shared to solve the two problems. To this end, we propose a new masking method called phase-aware beta-sigmoid mask (PHM), which reuses the estimated magnitude values to estimate the clean phase by respecting the triangle inequality in the complex domain between three signal components such as mixture, source and the rest. 
Two PHMs are used to deal with direct and reverberant source, which allows controlling the proportion of reverberation in the enhanced speech at inference time. In addition, to improve the speech enhancement performance, we propose a new time-domain loss function and show a reasonable performance gain compared to MSE loss in the complex domain. Finally, to achieve a real-time inference, an optimization strategy for U-Net is proposed which significantly reduces the computational overhead up to 88.9% compared to the naïve version. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2006.00687v1-abstract-full').style.display = 'none'; document.getElementById('2006.00687v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 May, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2020. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages, 3 figures, Submitted to Interspeech2020</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2005.08776">arXiv:2005.08776</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2005.08776">pdf</a>, <a href="https://arxiv.org/format/2005.08776">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Metric Learning for Keyword Spotting </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Huh%2C+J">Jaesung Huh</a>, <a href="/search/eess?searchtype=author&amp;query=Lee%2C+M">Minjae Lee</a>, <a href="/search/eess?searchtype=author&amp;query=Heo%2C+H">Heesoo Heo</a>, <a href="/search/eess?searchtype=author&amp;query=Mun%2C+S">Seongkyu Mun</a>, <a href="/search/eess?searchtype=author&amp;query=Chung%2C+J+S">Joon Son Chung</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2005.08776v1-abstract-short" style="display: inline;"> The goal of this work is to train effective representations for keyword spotting via metric learning. Most existing works address keyword spotting as a closed-set classification problem, where both target and non-target keywords are predefined. Therefore, prevailing classifier-based keyword spotting systems perform poorly on non-target sounds which are unseen during the training stage, causing hig&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2005.08776v1-abstract-full').style.display = 'inline'; document.getElementById('2005.08776v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2005.08776v1-abstract-full" style="display: none;"> The goal of this work is to train effective representations for keyword spotting via metric learning. Most existing works address keyword spotting as a closed-set classification problem, where both target and non-target keywords are predefined.
Therefore, prevailing classifier-based keyword spotting systems perform poorly on non-target sounds which are unseen during the training stage, causing high false alarm rates in real-world scenarios. In reality, keyword spotting is a detection problem where predefined target keywords are detected from a variety of unknown sounds. This shares many similarities to metric learning problems in that the unseen and unknown non-target sounds must be clearly differentiated from the target keywords. However, a key difference is that the target keywords are known and predefined. To this end, we propose a new method based on metric learning that maximises the distance between target and non-target keywords, but also learns per-class weights for target keywords à la classification objectives. Experiments on the Google Speech Commands dataset show that our method significantly reduces false alarms to unseen non-target keywords, while maintaining the overall classification accuracy. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2005.08776v1-abstract-full').style.display = 'none'; document.getElementById('2005.08776v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 May, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2020. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2005.08606">arXiv:2005.08606</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2005.08606">pdf</a>, <a href="https://arxiv.org/format/2005.08606">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> End-to-End Lip Synchronisation Based on Pattern Classification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Kim%2C+Y+J">You Jin Kim</a>, <a href="/search/eess?searchtype=author&amp;query=Heo%2C+H+S">Hee Soo Heo</a>, <a href="/search/eess?searchtype=author&amp;query=Chung%2C+S">Soo-Whan Chung</a>, <a href="/search/eess?searchtype=author&amp;query=Lee%2C+B">Bong-Jin Lee</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2005.08606v2-abstract-short" style="display: inline;"> The goal of this work is to synchronise audio and video of a talking face using deep neural network models. Existing works have trained networks on proxy tasks such as cross-modal similarity learning, and then computed similarities between audio and video frames using a sliding window approach.
While these methods demonstrate satisfactory performance, the networks are not trained directly on the t&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2005.08606v2-abstract-full').style.display = 'inline'; document.getElementById('2005.08606v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2005.08606v2-abstract-full" style="display: none;"> The goal of this work is to synchronise audio and video of a talking face using deep neural network models. Existing works have trained networks on proxy tasks such as cross-modal similarity learning, and then computed similarities between audio and video frames using a sliding window approach. While these methods demonstrate satisfactory performance, the networks are not trained directly on the task. To this end, we propose an end-to-end trained network that can directly predict the offset between an audio stream and the corresponding video stream. The similarity matrix between the two modalities is first computed from the features, then the inference of the offset can be considered to be a pattern recognition problem where the matrix is considered equivalent to an image. The feature extractor and the classifier are trained jointly. We demonstrate that the proposed approach outperforms the previous work by a large margin on LRS2 and LRS3 datasets. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2005.08606v2-abstract-full').style.display = 'none'; document.getElementById('2005.08606v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 March, 2021; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 May, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2020. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">slt 2021 accepted</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2003.11982">arXiv:2003.11982</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2003.11982">pdf</a>, <a href="https://arxiv.org/ps/2003.11982">ps</a>, <a href="https://arxiv.org/format/2003.11982">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.21437/Interspeech.2020-1064">10.21437/Interspeech.2020-1064 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> In defence of metric learning for speaker recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Chung%2C+J+S">Joon Son Chung</a>, <a href="/search/eess?searchtype=author&amp;query=Huh%2C+J">Jaesung Huh</a>, <a href="/search/eess?searchtype=author&amp;query=Mun%2C+S">Seongkyu Mun</a>, <a href="/search/eess?searchtype=author&amp;query=Lee%2C+M">Minjae Lee</a>, <a href="/search/eess?searchtype=author&amp;query=Heo%2C+H+S">Hee Soo Heo</a>, <a href="/search/eess?searchtype=author&amp;query=Choe%2C+S">Soyeon Choe</a>, <a href="/search/eess?searchtype=author&amp;query=Ham%2C+C">Chiheon Ham</a>, <a href="/search/eess?searchtype=author&amp;query=Jung%2C+S">Sunghwan Jung</a>, <a href="/search/eess?searchtype=author&amp;query=Lee%2C+B">Bong-Jin Lee</a>, <a href="/search/eess?searchtype=author&amp;query=Han%2C+I">Icksang Han</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2003.11982v2-abstract-short" style="display: inline;"> The objective of this paper is &#39;open-set&#39; speaker recognition of unseen speakers, where ideal embeddings should be able to condense information into a compact utterance-level representation that has small intra-speaker and large inter-speaker distance. A popular belief in speaker recognition is that networks trained with classification objectives outperform metric learning methods. In this paper&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2003.11982v2-abstract-full').style.display = 'inline'; document.getElementById('2003.11982v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2003.11982v2-abstract-full" style="display: none;"> The objective of this paper is &#39;open-set&#39; speaker recognition of unseen speakers, where ideal embeddings should be able to condense information into a compact utterance-level representation that has small intra-speaker and large inter-speaker distance. A popular belief in speaker recognition is that networks trained with classification objectives outperform metric learning methods. 
In this paper, we present an extensive evaluation of most popular loss functions for speaker recognition on the VoxCeleb dataset. We demonstrate that the vanilla triplet loss shows competitive performance compared to classification-based losses, and those trained with our proposed metric learning objective outperform state-of-the-art methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2003.11982v2-abstract-full').style.display = 'none'; document.getElementById('2003.11982v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 April, 2020; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 26 March, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2020. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">The code can be found at https://github.com/clovaai/voxceleb_trainer</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2001.11688">arXiv:2001.11688</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2001.11688">pdf</a>, <a href="https://arxiv.org/format/2001.11688">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> A study on the role of subsidiary information in replay attack spoofing detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Jung%2C+J">Jee-weon Jung</a>, <a href="/search/eess?searchtype=author&amp;query=Shim%2C+H">Hye-jin Shim</a>, <a href="/search/eess?searchtype=author&amp;query=Heo%2C+H">Hee-Soo Heo</a>, <a href="/search/eess?searchtype=author&amp;query=Yu%2C+H">Ha-Jin Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2001.11688v1-abstract-short" style="display: inline;"> In this study, we analyze the role of various categories of subsidiary information in conducting replay attack spoofing detection: `Room Size&#39;, `Reverberation&#39;, `Speaker-to-ASV distance, `Attacker-to-Speaker distance&#39;, and `Replay Device Quality&#39;. As a means of analyzing subsidiary information, we use two frameworks to either subtract or include a category of subsidiary information to the code ext&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2001.11688v1-abstract-full').style.display = 'inline'; document.getElementById('2001.11688v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2001.11688v1-abstract-full" style="display: none;"> In this study, we analyze the role of various categories of subsidiary information in conducting replay attack spoofing detection: `Room Size&#39;, `Reverberation&#39;, `Speaker-to-ASV distance, `Attacker-to-Speaker distance&#39;, and `Replay Device Quality&#39;. 
As a means of analyzing subsidiary information, we use two frameworks to either subtract or include a category of subsidiary information to the code extracted from a deep neural network. For subtraction, we utilize an adversarial process framework which makes the code orthogonal to the basis vectors of the subsidiary information. For addition, we utilize the multi-task learning framework to include subsidiary information to the code. All experiments are conducted using the ASVspoof 2019 physical access scenario with the provided meta data. Through the analysis of the result of the two approaches, we conclude that various categories of subsidiary information does not reside enough in the code when the deep neural network is trained for binary classification. Explicitly including various categories of subsidiary information through the multi-task learning framework can help improve performance in closed set condition. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2001.11688v1-abstract-full').style.display = 'none'; document.getElementById('2001.11688v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 January, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2020. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1910.09778">arXiv:1910.09778</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1910.09778">pdf</a>, <a href="https://arxiv.org/format/1910.09778">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Self-supervised pre-training with acoustic configurations for replay spoofing detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Shim%2C+H">Hye-jin Shim</a>, <a href="/search/eess?searchtype=author&amp;query=Heo%2C+H">Hee-Soo Heo</a>, <a href="/search/eess?searchtype=author&amp;query=Jung%2C+J">Jee-weon Jung</a>, <a href="/search/eess?searchtype=author&amp;query=Yu%2C+H">Ha-Jin Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1910.09778v2-abstract-short" style="display: inline;"> Constructing a dataset for replay spoofing detection requires a physical process of playing an utterance and re-recording it, presenting a challenge to the collection of large-scale datasets. In this study, we propose a self-supervised framework for pretraining acoustic configurations using datasets published for other tasks, such as speaker verification. 
Here, acoustic configurations refer to the&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1910.09778v2-abstract-full').style.display = 'inline'; document.getElementById('1910.09778v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1910.09778v2-abstract-full" style="display: none;"> Constructing a dataset for replay spoofing detection requires a physical process of playing an utterance and re-recording it, presenting a challenge to the collection of large-scale datasets. In this study, we propose a self-supervised framework for pretraining acoustic configurations using datasets published for other tasks, such as speaker verification. Here, acoustic configurations refer to the environmental factors generated during the process of voice recording but not the voice itself, including microphone types, place and ambient noise levels. Specifically, we select pairs of segments from utterances and train deep neural networks to determine whether the acoustic configurations of the two segments are identical. We validate the effectiveness of the proposed method based on the ASVspoof 2019 physical access dataset utilizing two well-performing systems. The experimental results demonstrate that the proposed method outperforms the baseline approach by 30%. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1910.09778v2-abstract-full').style.display = 'none'; document.getElementById('1910.09778v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 August, 2020; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 22 October, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2019. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1907.00542">arXiv:1907.00542</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1907.00542">pdf</a>, <a href="https://arxiv.org/format/1907.00542">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Cosine similarity-based adversarial process </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Heo%2C+H">Hee-Soo Heo</a>, <a href="/search/eess?searchtype=author&amp;query=Jung%2C+J">Jee-weon Jung</a>, <a href="/search/eess?searchtype=author&amp;query=Shim%2C+H">Hye-jin Shim</a>, <a href="/search/eess?searchtype=author&amp;query=Yang%2C+I">IL-Ho Yang</a>, <a href="/search/eess?searchtype=author&amp;query=Yu%2C+H">Ha-Jin Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1907.00542v1-abstract-short" style="display: inline;"> An adversarial process between two deep neural networks is a promising approach to train a robust model. In this paper, we propose an adversarial process using cosine similarity, whereas conventional adversarial processes are based on inverted categorical cross entropy (CCE). When used for training an identification model, the adversarial process induces the competition of two discriminative model&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1907.00542v1-abstract-full').style.display = 'inline'; document.getElementById('1907.00542v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1907.00542v1-abstract-full" style="display: none;"> An adversarial process between two deep neural networks is a promising approach to train a robust model. In this paper, we propose an adversarial process using cosine similarity, whereas conventional adversarial processes are based on inverted categorical cross entropy (CCE). When used for training an identification model, the adversarial process induces the competition of two discriminative models; one for a primary task such as speaker identification or image recognition, the other one for a subsidiary task such as channel identification or domain identification. In particular, the adversarial process degrades the performance of the subsidiary model by eliminating the subsidiary information in the input which, in assumption, may degrade the performance of the primary model. The conventional adversarial processes maximize the CCE of the subsidiary model to degrade the performance. We have studied a framework for training robust discriminative models by eliminating channel or domain information (subsidiary information) by applying such an adversarial process. However, we found through experiments that using the process of maximizing the CCE does not guarantee the performance degradation of the subsidiary model. 
In the proposed adversarial process using cosine similarity, on the contrary, the performance of the subsidiary model can be degraded more efficiently by searching feature space orthogonal to the subsidiary model. The experiments on speaker identification and image recognition show that we found features that make the outputs of the subsidiary models independent of the input, and the performances of the primary models are improved. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1907.00542v1-abstract-full').style.display = 'none'; document.getElementById('1907.00542v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 July, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2019. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages, 6 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1904.10135">arXiv:1904.10135</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1904.10135">pdf</a>, <a href="https://arxiv.org/format/1904.10135">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Acoustic scene classification using teacher-student learning with soft-labels </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Heo%2C+H">Hee-Soo Heo</a>, <a href="/search/eess?searchtype=author&amp;query=Jung%2C+J">Jee-weon Jung</a>, <a href="/search/eess?searchtype=author&amp;query=Shim%2C+H">Hye-jin Shim</a>, <a href="/search/eess?searchtype=author&amp;query=Yu%2C+H">Ha-Jin Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1904.10135v2-abstract-short" style="display: inline;"> Acoustic scene classification identifies an input segment into one of the pre-defined classes using spectral information. The spectral information of acoustic scenes may not be mutually exclusive due to common acoustic properties across different classes, such as babble noises included in both airports and shopping malls. However, conventional training procedure based on one-hot labels does not co&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1904.10135v2-abstract-full').style.display = 'inline'; document.getElementById('1904.10135v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1904.10135v2-abstract-full" style="display: none;"> Acoustic scene classification identifies an input segment into one of the pre-defined classes using spectral information. The spectral information of acoustic scenes may not be mutually exclusive due to common acoustic properties across different classes, such as babble noises included in both airports and shopping malls. 
However, conventional training procedure based on one-hot labels does not consider the similarities between different acoustic scenes. We exploit teacher-student learning with the purpose to derive soft-labels that consider common acoustic properties among different acoustic scenes. In teacher-student learning, the teacher network produces soft-labels, based on which the student network is trained. We investigate various methods to extract soft-labels that better represent similarities across different scenes. Such attempts include extracting soft-labels from multiple audio segments that are defined as an identical acoustic scene. Experimental results demonstrate the potential of our approach, showing a classification accuracy of 77.36 % on the DCASE 2018 task 1 validation set. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1904.10135v2-abstract-full').style.display = 'none'; document.getElementById('1904.10135v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 July, 2019; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 22 April, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2019. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted for presentation at Interspeech 2019</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1904.10134">arXiv:1904.10134</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1904.10134">pdf</a>, <a href="https://arxiv.org/format/1904.10134">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Replay attack detection with complementary high-resolution information using end-to-end DNN for the ASVspoof 2019 Challenge </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Jung%2C+J">Jee-weon Jung</a>, <a href="/search/eess?searchtype=author&amp;query=Shim%2C+H">Hye-jin Shim</a>, <a href="/search/eess?searchtype=author&amp;query=Heo%2C+H">Hee-Soo Heo</a>, <a href="/search/eess?searchtype=author&amp;query=Yu%2C+H">Ha-Jin Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1904.10134v2-abstract-short" style="display: inline;"> In this study, we concentrate on replacing the process of extracting hand-crafted acoustic feature with end-to-end DNN using complementary high-resolution spectrograms. As a result of advance in audio devices, typical characteristics of a replayed speech based on conventional knowledge alter or diminish in unknown replay configurations. 
Thus, it has become increasingly difficult to detect spoofed&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1904.10134v2-abstract-full').style.display = 'inline'; document.getElementById('1904.10134v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1904.10134v2-abstract-full" style="display: none;"> In this study, we concentrate on replacing the process of extracting hand-crafted acoustic feature with end-to-end DNN using complementary high-resolution spectrograms. As a result of advance in audio devices, typical characteristics of a replayed speech based on conventional knowledge alter or diminish in unknown replay configurations. Thus, it has become increasingly difficult to detect spoofed speech with a conventional knowledge-based approach. To detect unrevealed characteristics that reside in a replayed speech, we directly input spectrograms into an end-to-end DNN without knowledge-based intervention. Explorations dealt in this study that differentiates from existing spectrogram-based systems are twofold: complementary information and high-resolution. Spectrograms with different information are explored, and it is shown that additional information such as the phase information can be complementary. High-resolution spectrograms are employed with the assumption that the difference between a bona-fide and a replayed speech exists in the details. Additionally, to verify whether other features are complementary to spectrograms, we also examine raw waveform and an i-vector based system. Experiments conducted on the ASVspoof 2019 physical access challenge show promising results, where t-DCF and equal error rates are 0.0570 and 2.45 % for the evaluation set, respectively. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1904.10134v2-abstract-full').style.display = 'none'; document.getElementById('1904.10134v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 July, 2019; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 22 April, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2019. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted for oral presentation at Interspeech 2019, code available at https://github.com/Jungjee/ASVspoof2019_PA</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1904.08104">arXiv:1904.08104</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1904.08104">pdf</a>, <a href="https://arxiv.org/ps/1904.08104">ps</a>, <a href="https://arxiv.org/format/1904.08104">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> RawNet: Advanced end-to-end deep neural network using raw waveforms for text-independent speaker verification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Jung%2C+J">Jee-weon Jung</a>, <a href="/search/eess?searchtype=author&amp;query=Heo%2C+H">Hee-Soo Heo</a>, <a href="/search/eess?searchtype=author&amp;query=Kim%2C+J">Ju-ho Kim</a>, <a href="/search/eess?searchtype=author&amp;query=Shim%2C+H">Hye-jin Shim</a>, <a href="/search/eess?searchtype=author&amp;query=Yu%2C+H">Ha-Jin Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1904.08104v2-abstract-short" style="display: inline;"> Recently, direct modeling of raw waveforms using deep neural networks has been widely studied for a number of tasks in audio domains. In speaker verification, however, utilization of raw waveforms is in its preliminary phase, requiring further investigation. In this study, we explore end-to-end deep neural networks that input raw waveforms to improve various aspects: front-end speaker embedding ex&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1904.08104v2-abstract-full').style.display = 'inline'; document.getElementById('1904.08104v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1904.08104v2-abstract-full" style="display: none;"> Recently, direct modeling of raw waveforms using deep neural networks has been widely studied for a number of tasks in audio domains. In speaker verification, however, utilization of raw waveforms is in its preliminary phase, requiring further investigation. In this study, we explore end-to-end deep neural networks that input raw waveforms to improve various aspects: front-end speaker embedding extraction including model architecture, pre-training scheme, additional objective functions, and back-end classification. Adjustment of model architecture using a pre-training scheme can extract speaker embeddings, giving a significant improvement in performance. Additional objective functions simplify the process of extracting speaker embeddings by merging conventional two-phase processes: extracting utterance-level features such as i-vectors or x-vectors and the feature enhancement phase, e.g., linear discriminant analysis. 
Effective back-end classification models that suit the proposed speaker embedding are also explored. We propose an end-to-end system that comprises two deep neural networks, one front-end for utterance-level speaker embedding extraction and the other for back-end classification. Experiments conducted on the VoxCeleb1 dataset demonstrate that the proposed model achieves state-of-the-art performance among systems without data augmentation. The proposed system is also comparable to the state-of-the-art x-vector system that adopts data augmentation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1904.08104v2-abstract-full').style.display = 'none'; document.getElementById('1904.08104v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 July, 2019; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 17 April, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2019. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted for oral presentation at Interspeech 2019, code available at http://github.com/Jungjee/RawNet</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1902.02455">arXiv:1902.02455</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1902.02455">pdf</a>, <a href="https://arxiv.org/format/1902.02455">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> End-to-end losses based on speaker basis vectors and all-speaker hard negative mining for speaker verification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Heo%2C+H">Hee-Soo Heo</a>, <a href="/search/eess?searchtype=author&amp;query=Jung%2C+J">Jee-weon Jung</a>, <a href="/search/eess?searchtype=author&amp;query=Yang%2C+I">IL-Ho Yang</a>, <a href="/search/eess?searchtype=author&amp;query=Yoon%2C+S">Sung-Hyun Yoon</a>, <a href="/search/eess?searchtype=author&amp;query=Shim%2C+H">Hye-jin Shim</a>, <a href="/search/eess?searchtype=author&amp;query=Yu%2C+H">Ha-Jin Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1902.02455v3-abstract-short" style="display: inline;"> In recent years, speaker verification has primarily performed using deep neural networks that are trained to output embeddings from input features such as spectrograms or Mel-filterbank energies. Studies that design various loss functions, including metric learning have been widely explored. 
In this study, we propose two end-to-end loss functions for speaker verification using the concept of speak&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1902.02455v3-abstract-full').style.display = 'inline'; document.getElementById('1902.02455v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1902.02455v3-abstract-full" style="display: none;"> In recent years, speaker verification has primarily performed using deep neural networks that are trained to output embeddings from input features such as spectrograms or Mel-filterbank energies. Studies that design various loss functions, including metric learning have been widely explored. In this study, we propose two end-to-end loss functions for speaker verification using the concept of speaker bases, which are trainable parameters. One loss function is designed to further increase the inter-speaker variation, and the other is designed to conduct the identical concept with hard negative mining. Each speaker basis is designed to represent the corresponding speaker in the process of training deep neural networks. In contrast to the conventional loss functions that can consider only a limited number of speakers included in a mini-batch, the proposed loss functions can consider all the speakers in the training set regardless of the mini-batch composition. In particular, the proposed loss functions enable hard negative mining and calculations of between-speaker variations with consideration of all speakers. Through experiments on VoxCeleb1 and VoxCeleb2 datasets, we confirmed that the proposed loss functions could supplement conventional softmax and center loss functions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1902.02455v3-abstract-full').style.display = 'none'; document.getElementById('1902.02455v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 July, 2019; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 6 February, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2019. 

arXiv:1810.10884 (https://arxiv.org/abs/1810.10884) [pdf, other]  eess.AS (Audio and Speech Processing); cs.AI (Artificial Intelligence); cs.SD (Sound)

Short utterance compensation in speaker verification via cosine-based teacher-student learning of speaker embeddings

Authors: Jee-weon Jung, Hee-soo Heo, Hye-jin Shim, Ha-jin Yu

Abstract: The short duration of an input utterance is one of the most critical factors degrading the performance of speaker verification systems. This study aimed to develop an integrated text-independent speaker verification system that accepts utterances of 2 seconds or less. We propose an approach based on a teacher-student learning framework, applied to short-utterance compensation for the first time to the best of our knowledge. The core concept of the proposed system is to perform the compensation throughout the network that extracts the speaker embedding, mainly at the phonetic level, rather than compensating with a separate system after the embedding has been extracted. In the proposed architecture, phonetic-level features, each representing a 130 ms segment, are extracted by convolutional layers, and a layer of gated recurrent units aggregates them into an utterance-level feature. The proposed approach also adopts a new objective function for teacher-student learning that considers both the Kullback-Leibler divergence of the output layers and the cosine distance of the speaker embedding layers. Experiments were conducted on the VoxCeleb1 dataset using deep neural networks that take raw waveforms as input and output speaker embeddings. The proposed model compensated for approximately 65% of the performance degradation caused by the shortened duration.

Submitted 10 April, 2019; v1 submitted 25 October, 2018; originally announced October 2018.
Comments: 5 pages, 2 figures, submitted to Interspeech 2019 as a conference paper
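
As a rough illustration of the combined objective this abstract describes (Kullback-Leibler divergence on the output layers plus cosine distance on the speaker embedding layers), a minimal PyTorch-style sketch follows. The function name ts_compensation_loss and the weighting factor alpha are assumptions; only the two terms themselves come from the abstract.

    # Hypothetical sketch of the described teacher-student objective.
    import torch.nn.functional as F

    def ts_compensation_loss(student_logits, teacher_logits,
                             student_emb, teacher_emb, alpha=1.0):
        # KL divergence between the output distributions (teacher fed the full
        # utterance, student fed the truncated ~2 s utterance).
        kl = F.kl_div(F.log_softmax(student_logits, dim=1),
                      F.softmax(teacher_logits, dim=1),
                      reduction='batchmean')

        # Cosine distance between the two speaker embeddings.
        cos_dist = 1.0 - F.cosine_similarity(student_emb, teacher_emb, dim=1).mean()

        # alpha balances the two terms; its value is an assumption, not from the paper.
        return kl + alpha * cos_dist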

arXiv:1808.09638 (https://arxiv.org/abs/1808.09638) [pdf]  eess.AS (Audio and Speech Processing); cs.LG (Machine Learning); cs.SD (Sound); eess.SP (Signal Processing); stat.ML (Machine Learning)

Replay spoofing detection system for automatic speaker verification using multi-task learning of noise classes

Authors: Hye-Jin Shim, Jee-weon Jung, Hee-Soo Heo, Sunghyun Yoon, Ha-Jin Yu

Abstract: In this paper, we propose a replay-attack spoofing detection system for automatic speaker verification that uses multi-task learning of noise classes. We define the noise caused by a replay attack as replay noise and explore the effectiveness of training a deep neural network simultaneously for replay-attack spoofing detection and replay-noise classification. The multi-task learning covers classification of the noise of playback devices, recording environments, and recording devices in addition to spoofing detection, and each of the three noise class types also includes a genuine class. Experimental results on the ASVspoof 2017 dataset demonstrate that the proposed system improves performance by a relative 30% on the evaluation set.

Submitted 25 October, 2018; v1 submitted 29 August, 2018; originally announced August 2018.
Comments: 5 pages, accepted by Technologies and Applications of Artificial Intelligence (TAAI)
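
For readers who want the multi-task structure laid out explicitly, here is a minimal, assumption-laden sketch: a shared encoder feeding one spoofing-detection head and three replay-noise heads (playback device, recording environment, recording device), each of whose label spaces also includes a genuine class, as the abstract states. The layer sizes, class counts, and equal loss weights are placeholders rather than values from the paper.

    # Hypothetical sketch of the multi-task setup; sizes and weights are placeholders.
    import torch.nn as nn
    import torch.nn.functional as F

    class ReplayMultiTaskNet(nn.Module):
        def __init__(self, feat_dim, n_playback, n_environment, n_recording):
            super().__init__()
            self.encoder = nn.Sequential(
                nn.Linear(feat_dim, 256), nn.ReLU(),
                nn.Linear(256, 256), nn.ReLU(),
            )
            self.spoof_head = nn.Linear(256, 2)              # genuine vs. replay
            self.playback_head = nn.Linear(256, n_playback)  # includes a genuine class
            self.environment_head = nn.Linear(256, n_environment)
            self.recording_head = nn.Linear(256, n_recording)

        def forward(self, x):
            h = self.encoder(x)
            return (self.spoof_head(h), self.playback_head(h),
                    self.environment_head(h), self.recording_head(h))

    def multitask_loss(outputs, labels, weights=(1.0, 1.0, 1.0, 1.0)):
        # Sum of cross-entropy losses over the spoofing task and the three
        # replay-noise classification tasks (equal weighting is an assumption).
        return sum(w * F.cross_entropy(o, y)
                   for w, o, y in zip(weights, outputs, labels))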
