<!-- CINXE.COM -->

<!-- Audio and Speech Processing Sep 2024 -->

<!DOCTYPE html> <html lang="en"> <head> <title>Audio and Speech Processing Sep 2024</title> <meta name="viewport" content="width=device-width, initial-scale=1"> <link rel="apple-touch-icon" sizes="180x180" href="/static/browse/0.3.4/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="/static/browse/0.3.4/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="/static/browse/0.3.4/images/icons/favicon-16x16.png"> <link rel="manifest" href="/static/browse/0.3.4/images/icons/site.webmanifest"> <link rel="mask-icon" href="/static/browse/0.3.4/images/icons/safari-pinned-tab.svg" color="#5bbad5"> <meta name="msapplication-TileColor" content="#da532c"> <meta name="theme-color" content="#ffffff"> <link rel="stylesheet" type="text/css" media="screen" href="/static/browse/0.3.4/css/arXiv.css?v=20241206" /> <link rel="stylesheet" type="text/css" media="print" href="/static/browse/0.3.4/css/arXiv-print.css?v=20200611" /> <link rel="stylesheet" type="text/css" media="screen" href="/static/browse/0.3.4/css/browse_search.css" /> <script language="javascript" src="/static/browse/0.3.4/js/accordion.js" /></script> <script src="/static/browse/0.3.4/js/mathjaxToggle.min.js" type="text/javascript"></script> <script type="text/javascript" language="javascript">mathjaxToggle();</script> </head> <body class="with-cu-identity"> <div class="flex-wrap-footer"> <header> <a href="#content" class="is-sr-only">Skip to main content</a> <!-- start desktop header --> <div class="columns is-vcentered is-hidden-mobile" id="cu-identity"> <div class="column" id="cu-logo"> <a href="https://www.cornell.edu/"><img src="/static/browse/0.3.4/images/icons/cu/cornell-reduced-white-SMALL.svg" alt="Cornell University" /></a> </div><div class="column" id="support-ack"> <span id="support-ack-url">We gratefully acknowledge support from the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all 
contributors.</span> <a href="https://info.arxiv.org/about/donate.html" class="btn-header-donate">Donate</a> </div> </div> <div id="header" class="is-hidden-mobile"> <a aria-hidden="true" tabindex="-1" href="/IgnoreMe"></a> <div class="header-breadcrumbs"> <a href="/"><img src="/static/browse/0.3.4/images/arxiv-logo-one-color-white.svg" alt="arxiv logo" style="height:40px;"/></a> <span>&gt;</span> <a href="/list/eess.AS/recent">eess.AS</a> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div><!-- /end desktop header --> <div class="mobile-header"> <div class="columns is-mobile"> <div class="column logo-arxiv"><a 
href="https://arxiv.org/"><img src="/static/browse/0.3.4/images/arxiv-logomark-small-white.svg" alt="arXiv logo" style="height:60px;" /></a></div> <div class="column logo-cornell"><a href="https://www.cornell.edu/"> <picture> <source media="(min-width: 501px)" srcset="/static/browse/0.3.4/images/icons/cu/cornell-reduced-white-SMALL.svg 400w" sizes="400w" /> <source srcset="/static/browse/0.3.4/images/icons/cu/cornell_seal_simple_black.svg 2x" /> <img src="/static/browse/0.3.4/images/icons/cu/cornell-reduced-white-SMALL.svg" alt="Cornell University Logo" /> </picture> </a></div> <div class="column nav" id="toggle-container" role="menubar"> <button class="toggle-control"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-white"><title>open search</title><path d="M505 442.7L405.3 343c-4.5-4.5-10.6-7-17-7H372c27.6-35.3 44-79.7 44-128C416 93.1 322.9 0 208 0S0 93.1 0 208s93.1 208 208 208c48.3 0 92.7-16.4 128-44v16.3c0 6.4 2.5 12.5 7 17l99.7 99.7c9.4 9.4 24.6 9.4 33.9 0l28.3-28.3c9.4-9.4 9.4-24.6.1-34zM208 336c-70.7 0-128-57.2-128-128 0-70.7 57.2-128 128-128 70.7 0 128 57.2 128 128 0 70.7-57.2 128-128 128z"/></svg></button> <div class="mobile-toggle-block toggle-target"> <form class="mobile-search-form" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <input class="input" type="text" name="query" placeholder="Search..." 
aria-label="Search term or terms" /> <input type="hidden" name="source" value="header"> <input type="hidden" name="searchtype" value="all"> <button class="button">GO</button> </div> </form> </div> <button class="toggle-control"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-white" role="menu"><title>open navigation menu</title><path d="M16 132h416c8.837 0 16-7.163 16-16V76c0-8.837-7.163-16-16-16H16C7.163 60 0 67.163 0 76v40c0 8.837 7.163 16 16 16zm0 160h416c8.837 0 16-7.163 16-16v-40c0-8.837-7.163-16-16-16H16c-8.837 0-16 7.163-16 16v40c0 8.837 7.163 16 16 16zm0 160h416c8.837 0 16-7.163 16-16v-40c0-8.837-7.163-16-16-16H16c-8.837 0-16 7.163-16 16v40c0 8.837 7.163 16 16 16z"/ ></svg></button> <div class="mobile-toggle-block toggle-target"> <nav class="mobile-menu" aria-labelledby="mobilemenulabel"> <h2 id="mobilemenulabel">quick links</h2> <ul> <li><a href="https://arxiv.org/login">Login</a></li> <li><a href="https://info.arxiv.org/help">Help Pages</a></li> <li><a href="https://info.arxiv.org/about">About</a></li> </ul> </nav> </div> </div> </div> </div><!-- /end mobile-header --> </header> <main> <div id="content"> <div id='content-inner'> <div id='dlpage'> <h1>Audio and Speech Processing</h1> <h2>Authors and titles for September 2024 </h2> <div class='paging'>Total of 541 entries : <span>1-50</span> <a href=/list/eess.AS/2024-09?skip=50&amp;show=50>51-100</a> <a href=/list/eess.AS/2024-09?skip=100&amp;show=50>101-150</a> <a href=/list/eess.AS/2024-09?skip=150&amp;show=50>151-200</a> <span>...</span> <a href=/list/eess.AS/2024-09?skip=500&amp;show=50>501-541</a> </div> <div class='morefewer'>Showing up to 50 entries per page: <a href=/list/eess.AS/2024-09?skip=0&amp;show=25 rel="nofollow"> fewer</a> | <a href=/list/eess.AS/2024-09?skip=0&amp;show=100 rel="nofollow"> more</a> | <a href=/list/eess.AS/2024-09?skip=0&amp;show=2000 rel="nofollow"> all</a> </div> <dl id='articles'> <dt> <a name='item1'>[1]</a> <a href ="/abs/2409.00387" 
title="Abstract" id="2409.00387"> arXiv:2409.00387 </a> [<a href="/pdf/2409.00387" title="Download PDF" id="pdf-2409.00387" aria-labelledby="pdf-2409.00387">pdf</a>, <a href="https://arxiv.org/html/2409.00387v1" title="View HTML" id="html-2409.00387" aria-labelledby="html-2409.00387" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2409.00387" title="Other formats" id="oth-2409.00387" aria-labelledby="oth-2409.00387">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Progressive Residual Extraction based Pre-training for Speech Representation Learning </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Wang,+T">Tianrui Wang</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Li,+J">Jin Li</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Ma,+Z">Ziyang Ma</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Cao,+R">Rui Cao</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Chen,+X">Xie Chen</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Wang,+L">Longbiao Wang</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Ge,+M">Meng Ge</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Wang,+X">Xiaobao Wang</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Wang,+Y">Yuguang Wang</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Dang,+J">Jianwu Dang</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Tashi,+N">Nyima Tashi</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Audio and Speech Processing (eess.AS)</span>; Sound (cs.SD) </div> </div> </dd> <dt> <a name='item2'>[2]</a> <a href ="/abs/2409.00481" title="Abstract" id="2409.00481"> arXiv:2409.00481 </a> [<a href="/pdf/2409.00481" 
title="Download PDF" id="pdf-2409.00481" aria-labelledby="pdf-2409.00481">pdf</a>, <a href="https://arxiv.org/html/2409.00481v5" title="View HTML" id="html-2409.00481" aria-labelledby="html-2409.00481" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2409.00481" title="Other formats" id="oth-2409.00481" aria-labelledby="oth-2409.00481">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> DCIM-AVSR : Efficient Audio-Visual Speech Recognition via Dual Conformer Interaction Module </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Wang,+X">Xinyu Wang</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Jiang,+H">Haotian Jiang</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Huang,+H">Haolin Huang</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Fang,+Y">Yu Fang</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Xu,+M">Mengjie Xu</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Wang,+Q">Qian Wang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted to ICASSP 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Audio and Speech Processing (eess.AS)</span>; Sound (cs.SD) </div> </div> </dd> <dt> <a name='item3'>[3]</a> <a href ="/abs/2409.00552" title="Abstract" id="2409.00552"> arXiv:2409.00552 </a> [<a href="/pdf/2409.00552" title="Download PDF" id="pdf-2409.00552" aria-labelledby="pdf-2409.00552">pdf</a>, <a href="https://arxiv.org/html/2409.00552v1" title="View HTML" id="html-2409.00552" aria-labelledby="html-2409.00552" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2409.00552" title="Other formats" id="oth-2409.00552" aria-labelledby="oth-2409.00552">other</a>] </dt> <dd> <div class='meta'> <div 
class='list-title mathjax'><span class='descriptor'>Title:</span> Digit Recognition using Multimodal Spiking Neural Networks </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Bjorndahl,+W">William Bjorndahl</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Easton,+J">Jack Easton</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Modoff,+A">Austin Modoff</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Larson,+E+C">Eric C. Larson</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Camp,+J">Joseph Camp</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Rangarajan,+P">Prasanna Rangarajan</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 4 pages, 2 figures, submitted to 2025 IEEE International Conference on Acoustics, Speech, and Signal Processing </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Audio and Speech Processing (eess.AS)</span>; Computer Vision and Pattern Recognition (cs.CV); Multimedia (cs.MM); Sound (cs.SD) </div> </div> </dd> <dt> <a name='item4'>[4]</a> <a href ="/abs/2409.00562" title="Abstract" id="2409.00562"> arXiv:2409.00562 </a> [<a href="/pdf/2409.00562" title="Download PDF" id="pdf-2409.00562" aria-labelledby="pdf-2409.00562">pdf</a>, <a href="https://arxiv.org/html/2409.00562v2" title="View HTML" id="html-2409.00562" aria-labelledby="html-2409.00562" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2409.00562" title="Other formats" id="oth-2409.00562" aria-labelledby="oth-2409.00562">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Comparative Analysis of Modality Fusion Approaches for Audio-Visual Person Identification and Verification </div> <div class='list-authors'><a 
href="https://arxiv.org/search/eess?searchtype=author&amp;query=Farhadipour,+A">Aref Farhadipour</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Chapariniya,+M">Masoumeh Chapariniya</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Vukovic,+T">Teodora Vukovic</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Dellwo,+V">Volker Dellwo</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> This paper was accepted at the ICNLSP2024 conference </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Audio and Speech Processing (eess.AS)</span>; Computer Vision and Pattern Recognition (cs.CV); Multimedia (cs.MM); Sound (cs.SD) </div> </div> </dd> <dt> <a name='item5'>[5]</a> <a href ="/abs/2409.01160" title="Abstract" id="2409.01160"> arXiv:2409.01160 </a> [<a href="/pdf/2409.01160" title="Download PDF" id="pdf-2409.01160" aria-labelledby="pdf-2409.01160">pdf</a>, <a href="https://arxiv.org/html/2409.01160v1" title="View HTML" id="html-2409.01160" aria-labelledby="html-2409.01160" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2409.01160" title="Other formats" id="oth-2409.01160" aria-labelledby="oth-2409.01160">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Expanding on EnCLAP with Auxiliary Retrieval Model for Automated Audio Captioning </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Kim,+J">Jaeyeon Kim</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Jung,+J">Jaeyoon Jung</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Jeon,+M">Minjeong Jeon</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Woo,+S+H">Sang Hoon Woo</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Lee,+J">Jinjoo Lee</a></div> <div 
class='list-comments mathjax'><span class='descriptor'>Comments:</span> DCASE2024 Challenge Technical Report. Ranked 2nd in Task 6 Automated Audio Captioning </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Audio and Speech Processing (eess.AS)</span>; Artificial Intelligence (cs.AI); Sound (cs.SD) </div> </div> </dd> <dt> <a name='item6'>[6]</a> <a href ="/abs/2409.01201" title="Abstract" id="2409.01201"> arXiv:2409.01201 </a> [<a href="/pdf/2409.01201" title="Download PDF" id="pdf-2409.01201" aria-labelledby="pdf-2409.01201">pdf</a>, <a href="https://arxiv.org/html/2409.01201v1" title="View HTML" id="html-2409.01201" aria-labelledby="html-2409.01201" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2409.01201" title="Other formats" id="oth-2409.01201" aria-labelledby="oth-2409.01201">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> EnCLAP++: Analyzing the EnCLAP Framework for Optimizing Automated Audio Captioning Performance </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Kim,+J">Jaeyeon Kim</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Jeon,+M">Minjeon Jeon</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Jung,+J">Jaeyoon Jung</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Woo,+S+H">Sang Hoon Woo</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Lee,+J">Jinjoo Lee</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted to DCASE2024 Workshop </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Audio and Speech Processing (eess.AS)</span>; Artificial Intelligence (cs.AI); Sound (cs.SD) </div> </div> </dd> <dt> <a name='item7'>[7]</a> <a href ="/abs/2409.01209" title="Abstract" 
id="2409.01209"> arXiv:2409.01209 </a> [<a href="/pdf/2409.01209" title="Download PDF" id="pdf-2409.01209" aria-labelledby="pdf-2409.01209">pdf</a>, <a href="https://arxiv.org/html/2409.01209v1" title="View HTML" id="html-2409.01209" aria-labelledby="html-2409.01209" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2409.01209" title="Other formats" id="oth-2409.01209" aria-labelledby="oth-2409.01209">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Suppressing Noise Disparity in Training Data for Automatic Pathological Speech Detection </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Amiri,+M">Mahdi Amiri</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Kodrasi,+I">Ina Kodrasi</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> To appear in IWAENC 2024 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Audio and Speech Processing (eess.AS)</span>; Sound (cs.SD) </div> </div> </dd> <dt> <a name='item8'>[8]</a> <a href ="/abs/2409.01438" title="Abstract" id="2409.01438"> arXiv:2409.01438 </a> [<a href="/pdf/2409.01438" title="Download PDF" id="pdf-2409.01438" aria-labelledby="pdf-2409.01438">pdf</a>, <a href="https://arxiv.org/html/2409.01438v2" title="View HTML" id="html-2409.01438" aria-labelledby="html-2409.01438" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2409.01438" title="Other formats" id="oth-2409.01438" aria-labelledby="oth-2409.01438">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Resource-Efficient Adaptation of Speech Foundation Models for Multi-Speaker ASR </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Wang,+W">Weiqing Wang</a>, <a 
href="https://arxiv.org/search/eess?searchtype=author&amp;query=Dhawan,+K">Kunal Dhawan</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Park,+T">Taejin Park</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Puvvada,+K+C">Krishna C. Puvvada</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Medennikov,+I">Ivan Medennikov</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Majumdar,+S">Somshubra Majumdar</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Huang,+H">He Huang</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Balam,+J">Jagadeesh Balam</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Ginsburg,+B">Boris Ginsburg</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted by SLT 2024 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Audio and Speech Processing (eess.AS)</span>; Sound (cs.SD) </div> </div> </dd> <dt> <a name='item9'>[9]</a> <a href ="/abs/2409.01776" title="Abstract" id="2409.01776"> arXiv:2409.01776 </a> [<a href="/pdf/2409.01776" title="Download PDF" id="pdf-2409.01776" aria-labelledby="pdf-2409.01776">pdf</a>, <a href="https://arxiv.org/html/2409.01776v1" title="View HTML" id="html-2409.01776" aria-labelledby="html-2409.01776" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2409.01776" title="Other formats" id="oth-2409.01776" aria-labelledby="oth-2409.01776">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Steered Response Power-Based Direction-of-Arrival Estimation Exploiting an Auxiliary Microphone </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Br%C3%BCmann,+K">Klaus Brümann</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Doclo,+S">Simon
Doclo</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 5 pages, 3 figures, conference: EUSIPCO 2024 in Lyon </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Audio and Speech Processing (eess.AS)</span>; Sound (cs.SD); Signal Processing (eess.SP) </div> </div> </dd> <dt> <a name='item10'>[10]</a> <a href ="/abs/2409.01813" title="Abstract" id="2409.01813"> arXiv:2409.01813 </a> [<a href="/pdf/2409.01813" title="Download PDF" id="pdf-2409.01813" aria-labelledby="pdf-2409.01813">pdf</a>, <a href="https://arxiv.org/html/2409.01813v3" title="View HTML" id="html-2409.01813" aria-labelledby="html-2409.01813" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2409.01813" title="Other formats" id="oth-2409.01813" aria-labelledby="oth-2409.01813">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Reassessing Noise Augmentation Methods in the Context of Adversarial Speech </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Pizzi,+K">Karla Pizzi</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Pizarro,+M">Matías Pizarro</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Fischer,+A">Asja Fischer</a></div> <div class='list-journal-ref'><span class='descriptor'>Journal-ref:</span> Proc.
4th Symposium on Security and Privacy in Speech Communication, 26-32, 2024 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Audio and Speech Processing (eess.AS)</span>; Machine Learning (cs.LG); Sound (cs.SD) </div> </div> </dd> <dt> <a name='item11'>[11]</a> <a href ="/abs/2409.01995" title="Abstract" id="2409.01995"> arXiv:2409.01995 </a> [<a href="/pdf/2409.01995" title="Download PDF" id="pdf-2409.01995" aria-labelledby="pdf-2409.01995">pdf</a>, <a href="https://arxiv.org/html/2409.01995v3" title="View HTML" id="html-2409.01995" aria-labelledby="html-2409.01995" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2409.01995" title="Other formats" id="oth-2409.01995" aria-labelledby="oth-2409.01995">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> vec2wav 2.0: Advancing Voice Conversion via Discrete Token Vocoders </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Guo,+Y">Yiwei Guo</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Li,+Z">Zhihan Li</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Li,+J">Junjie Li</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Du,+C">Chenpeng Du</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Wang,+H">Hankun Wang</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Wang,+S">Shuai Wang</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Chen,+X">Xie Chen</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Yu,+K">Kai Yu</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 5 pages, 4 figures. 
Demo page: <a href="https://cantabile-kwok.github.io/vec2wav2/" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Audio and Speech Processing (eess.AS)</span>; Artificial Intelligence (cs.AI); Sound (cs.SD) </div> </div> </dd> <dt> <a name='item12'>[12]</a> <a href ="/abs/2409.02041" title="Abstract" id="2409.02041"> arXiv:2409.02041 </a> [<a href="/pdf/2409.02041" title="Download PDF" id="pdf-2409.02041" aria-labelledby="pdf-2409.02041">pdf</a>, <a href="https://arxiv.org/html/2409.02041v2" title="View HTML" id="html-2409.02041" aria-labelledby="html-2409.02041" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2409.02041" title="Other formats" id="oth-2409.02041" aria-labelledby="oth-2409.02041">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> The USTC-NERCSLIP Systems for the CHiME-8 NOTSOFAR-1 Challenge </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Niu,+S">Shutong Niu</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Wang,+R">Ruoyu Wang</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Du,+J">Jun Du</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Yang,+G">Gaobin Yang</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Tu,+Y">Yanhui Tu</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Wu,+S">Siyuan Wu</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Qian,+S">Shuangqing Qian</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Wu,+H">Huaxin Wu</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Xu,+H">Haitao Xu</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Zhang,+X">Xueyang Zhang</a>, <a 
href="https://arxiv.org/search/eess?searchtype=author&amp;query=Zhong,+G">Guolong Zhong</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Yu,+X">Xindi Yu</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Chen,+J">Jieru Chen</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Wang,+M">Mengzhi Wang</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Cai,+D">Di Cai</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Gao,+T">Tian Gao</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Wan,+G">Genshun Wan</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Ma,+F">Feng Ma</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Pan,+J">Jia Pan</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Gao,+J">Jianqing Gao</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Audio and Speech Processing (eess.AS)</span>; Sound (cs.SD) </div> </div> </dd> <dt> <a name='item13'>[13]</a> <a href ="/abs/2409.02302" title="Abstract" id="2409.02302"> arXiv:2409.02302 </a> [<a href="/pdf/2409.02302" title="Download PDF" id="pdf-2409.02302" aria-labelledby="pdf-2409.02302">pdf</a>, <a href="https://arxiv.org/html/2409.02302v1" title="View HTML" id="html-2409.02302" aria-labelledby="html-2409.02302" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2409.02302" title="Other formats" id="oth-2409.02302" aria-labelledby="oth-2409.02302">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Speech Foundation Model Ensembles for the Controlled Singing Voice Deepfake Detection (CtrSVDD) Challenge 2024 </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Guragain,+A">Anmol Guragain</a>, <a 
href="https://arxiv.org/search/eess?searchtype=author&amp;query=Liu,+T">Tianchi Liu</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Pan,+Z">Zihan Pan</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Sailor,+H+B">Hardik B. Sailor</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Wang,+Q">Qiongqiong Wang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted to the IEEE Spoken Language Technology Workshop (SLT) 2024 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Audio and Speech Processing (eess.AS)</span>; Artificial Intelligence (cs.AI); Sound (cs.SD) </div> </div> </dd> <dt> <a name='item14'>[14]</a> <a href ="/abs/2409.02451" title="Abstract" id="2409.02451"> arXiv:2409.02451 </a> [<a href="/pdf/2409.02451" title="Download PDF" id="pdf-2409.02451" aria-labelledby="pdf-2409.02451">pdf</a>, <a href="https://arxiv.org/html/2409.02451v1" title="View HTML" id="html-2409.02451" aria-labelledby="html-2409.02451" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2409.02451" title="Other formats" id="oth-2409.02451" aria-labelledby="oth-2409.02451">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Fast, High-Quality and Parameter-Efficient Articulatory Synthesis using Differentiable DSP </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Liu,+Y">Yisi Liu</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Yu,+B">Bohan Yu</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Lin,+D">Drake Lin</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Wu,+P">Peter Wu</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Cho,+C+J">Cheol Jun Cho</a>, <a 
href="https://arxiv.org/search/eess?searchtype=author&amp;query=Anumanchipalli,+G+K">Gopala Krishna Anumanchipalli</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> accepted for Spoken Language Technology Workshop 2024 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Audio and Speech Processing (eess.AS)</span>; Artificial Intelligence (cs.AI); Sound (cs.SD) </div> </div> </dd> <dt> <a name='item15'>[15]</a> <a href ="/abs/2409.02466" title="Abstract" id="2409.02466"> arXiv:2409.02466 </a> [<a href="/pdf/2409.02466" title="Download PDF" id="pdf-2409.02466" aria-labelledby="pdf-2409.02466">pdf</a>, <a href="https://arxiv.org/html/2409.02466v1" title="View HTML" id="html-2409.02466" aria-labelledby="html-2409.02466" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2409.02466" title="Other formats" id="oth-2409.02466" aria-labelledby="oth-2409.02466">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> CUEMPATHY: A Counseling Speech Dataset for Psychotherapy Research </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Tao,+D">Dehua Tao</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Chui,+H">Harold Chui</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Luk,+S">Sarah Luk</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Lee,+T">Tan Lee</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted by ISCSLP 2022 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Audio and Speech Processing (eess.AS)</span>; Sound (cs.SD) </div> </div> </dd> <dt> <a name='item16'>[16]</a> <a href ="/abs/2409.02565" title="Abstract" id="2409.02565"> arXiv:2409.02565 </a> [<a href="/pdf/2409.02565" title="Download PDF" 
id="pdf-2409.02565" aria-labelledby="pdf-2409.02565">pdf</a>, <a href="https://arxiv.org/html/2409.02565v1" title="View HTML" id="html-2409.02565" aria-labelledby="html-2409.02565" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2409.02565" title="Other formats" id="oth-2409.02565" aria-labelledby="oth-2409.02565">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Efficient Extraction of Noise-Robust Discrete Units from Self-Supervised Speech Models </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Poncelet,+J">Jakob Poncelet</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Wang,+Y">Yujun Wang</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Van+hamme,+H">Hugo Van hamme</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted at SLT2024 </div> <div class='list-journal-ref'><span class='descriptor'>Journal-ref:</span> 2024 IEEE Spoken Language Technology Workshop (SLT), pp. 
200-207 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Audio and Speech Processing (eess.AS)</span>; Sound (cs.SD) </div> </div> </dd> <dt> <a name='item17'>[17]</a> <a href ="/abs/2409.02615" title="Abstract" id="2409.02615"> arXiv:2409.02615 </a> [<a href="/pdf/2409.02615" title="Download PDF" id="pdf-2409.02615" aria-labelledby="pdf-2409.02615">pdf</a>, <a href="https://arxiv.org/html/2409.02615v1" title="View HTML" id="html-2409.02615" aria-labelledby="html-2409.02615" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2409.02615" title="Other formats" id="oth-2409.02615" aria-labelledby="oth-2409.02615">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> USEF-TSE: Universal Speaker Embedding Free Target Speaker Extraction </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Zeng,+B">Bang Zeng</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Li,+M">Ming Li</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 13 pages, 6 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Audio and Speech Processing (eess.AS)</span>; Sound (cs.SD) </div> </div> </dd> <dt> <a name='item18'>[18]</a> <a href ="/abs/2409.03269" title="Abstract" id="2409.03269"> arXiv:2409.03269 </a> [<a href="/pdf/2409.03269" title="Download PDF" id="pdf-2409.03269" aria-labelledby="pdf-2409.03269">pdf</a>, <a href="https://arxiv.org/html/2409.03269v1" title="View HTML" id="html-2409.03269" aria-labelledby="html-2409.03269" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2409.03269" title="Other formats" id="oth-2409.03269" aria-labelledby="oth-2409.03269">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> A spherical 
harmonic-domain spatial audio signal enhancement method based on minimum variance distortionless response </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Zhang,+H">Huawei Zhang</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Zhang,+J">Jihui (Aimee) Zhang</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Sun,+H">Huiyuan (June) Sun</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Samarasinghe,+P">Prasanga Samarasinghe</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Audio and Speech Processing (eess.AS)</span> </div> </div> </dd> <dt> <a name='item19'>[19]</a> <a href ="/abs/2409.03520" title="Abstract" id="2409.03520"> arXiv:2409.03520 </a> [<a href="/pdf/2409.03520" title="Download PDF" id="pdf-2409.03520" aria-labelledby="pdf-2409.03520">pdf</a>, <a href="https://arxiv.org/html/2409.03520v1" title="View HTML" id="html-2409.03520" aria-labelledby="html-2409.03520" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2409.03520" title="Other formats" id="oth-2409.03520" aria-labelledby="oth-2409.03520">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Speaker and Style Disentanglement of Speech Based on Contrastive Predictive Coding Supported Factorized Variational Autoencoder </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Xie,+Y">Yuying Xie</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Kuhlmann,+M">Michael Kuhlmann</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Rautenberg,+F">Frederik Rautenberg</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Tan,+Z">Zheng-Hua Tan</a>, <a
href="https://arxiv.org/search/eess?searchtype=author&amp;query=Haeb-Umbach,+R">Reinhold Haeb-Umbach</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted by EUSIPCO 2024 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Audio and Speech Processing (eess.AS)</span>; Signal Processing (eess.SP) </div> </div> </dd> <dt> <a name='item20'>[20]</a> <a href ="/abs/2409.03610" title="Abstract" id="2409.03610"> arXiv:2409.03610 </a> [<a href="/pdf/2409.03610" title="Download PDF" id="pdf-2409.03610" aria-labelledby="pdf-2409.03610">pdf</a>, <a href="https://arxiv.org/html/2409.03610v1" title="View HTML" id="html-2409.03610" aria-labelledby="html-2409.03610" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2409.03610" title="Other formats" id="oth-2409.03610" aria-labelledby="oth-2409.03610">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> A Dual-Path Framework with Frequency-and-Time Excited Network for Anomalous Sound Detection </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Zhang,+Y">Yucong Zhang</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Liu,+J">Juan Liu</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Tian,+Y">Yao Tian</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Liu,+H">Haifeng Liu</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Li,+M">Ming Li</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> This Paper has been accepted to ICASSP 2024 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Audio and Speech Processing (eess.AS)</span> </div> </div> </dd> <dt> <a name='item21'>[21]</a> <a href ="/abs/2409.03636" title="Abstract" id="2409.03636"> 
arXiv:2409.03636 </a> [<a href="/pdf/2409.03636" title="Download PDF" id="pdf-2409.03636" aria-labelledby="pdf-2409.03636">pdf</a>, <a href="https://arxiv.org/html/2409.03636v3" title="View HTML" id="html-2409.03636" aria-labelledby="html-2409.03636" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2409.03636" title="Other formats" id="oth-2409.03636" aria-labelledby="oth-2409.03636">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Toward Any-to-Any Emotion Voice Conversion using Disentangled Diffusion Framework </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Chou,+H">Hsing-Hang Chou</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Lin,+Y">Yun-Shao Lin</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Sung,+C">Ching-Chin Sung</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Tsao,+Y">Yu Tsao</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Lee,+C">Chi-Chun Lee</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 5 pages; revised arguments, typos and references corrected </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Audio and Speech Processing (eess.AS)</span> </div> </div> </dd> <dt> <a name='item22'>[22]</a> <a href ="/abs/2409.03655" title="Abstract" id="2409.03655"> arXiv:2409.03655 </a> [<a href="/pdf/2409.03655" title="Download PDF" id="pdf-2409.03655" aria-labelledby="pdf-2409.03655">pdf</a>, <a href="https://arxiv.org/html/2409.03655v1" title="View HTML" id="html-2409.03655" aria-labelledby="html-2409.03655" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2409.03655" title="Other formats" id="oth-2409.03655" aria-labelledby="oth-2409.03655">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span 
class='descriptor'>Title:</span> Privacy versus Emotion Preservation Trade-offs in Emotion-Preserving Speaker Anonymization </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Cai,+Z">Zexin Cai</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Xinyuan,+H+L">Henry Li Xinyuan</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Garg,+A">Ashi Garg</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Garc%C3%ADa-Perera,+L+P">Leibny Paola García-Perera</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Duh,+K">Kevin Duh</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Khudanpur,+S">Sanjeev Khudanpur</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Andrews,+N">Nicholas Andrews</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Wiesner,+M">Matthew Wiesner</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> accepted by 2024 IEEE Spoken Language Technology Workshop </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Audio and Speech Processing (eess.AS)</span>; Machine Learning (cs.LG) </div> </div> </dd> <dt> <a name='item23'>[23]</a> <a href ="/abs/2409.04014" title="Abstract" id="2409.04014"> arXiv:2409.04014 </a> [<a href="/pdf/2409.04014" title="Download PDF" id="pdf-2409.04014" aria-labelledby="pdf-2409.04014">pdf</a>, <a href="/format/2409.04014" title="Other formats" id="oth-2409.04014" aria-labelledby="oth-2409.04014">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Development of the Listening in Spatialized Noise-Sentences (LiSN-S) Test in Brazilian Portuguese: Presentation Software, Speech Stimuli, and Sentence Equivalence </div> <div class='list-authors'><a
href="https://arxiv.org/search/eess?searchtype=author&amp;query=Masiero,+B+S">Bruno S. Masiero</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Borges,+L+R">Leticia R. Borges</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Dillon,+H">Harvey Dillon</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Colella-Santos,+M+F">Maria Francisca Colella-Santos</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Audio and Speech Processing (eess.AS)</span>; Sound (cs.SD) </div> </div> </dd> <dt> <a name='item24'>[24]</a> <a href ="/abs/2409.04136" title="Abstract" id="2409.04136"> arXiv:2409.04136 </a> [<a href="/pdf/2409.04136" title="Download PDF" id="pdf-2409.04136" aria-labelledby="pdf-2409.04136">pdf</a>, <a href="https://arxiv.org/html/2409.04136v2" title="View HTML" id="html-2409.04136" aria-labelledby="html-2409.04136" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2409.04136" title="Other formats" id="oth-2409.04136" aria-labelledby="oth-2409.04136">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Low-Complexity Own Voice Reconstruction for Hearables with an In-Ear Microphone </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Ohlenbusch,+M">Mattes Ohlenbusch</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Rollwage,+C">Christian Rollwage</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Doclo,+S">Simon Doclo</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 5 pages, 3 figures, submitted to ICASSP 2025; typos corrected </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Audio and Speech Processing (eess.AS)</span> </div> </div> </dd> <dt> <a name='item25'>[25]</a> <a href 
="/abs/2409.04173" title="Abstract" id="2409.04173"> arXiv:2409.04173 </a> [<a href="/pdf/2409.04173" title="Download PDF" id="pdf-2409.04173" aria-labelledby="pdf-2409.04173">pdf</a>, <a href="https://arxiv.org/html/2409.04173v2" title="View HTML" id="html-2409.04173" aria-labelledby="html-2409.04173" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2409.04173" title="Other formats" id="oth-2409.04173" aria-labelledby="oth-2409.04173">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> NPU-NTU System for Voice Privacy 2024 Challenge </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Yao,+J">Jixun Yao</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Kuzmin,+N">Nikita Kuzmin</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Wang,+Q">Qing Wang</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Guo,+P">Pengcheng Guo</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Ning,+Z">Ziqian Ning</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Guo,+D">Dake Guo</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Lee,+K+A">Kong Aik Lee</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Chng,+E">Eng-Siong Chng</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Xie,+L">Lei Xie</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> System description for VPC 2024 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Audio and Speech Processing (eess.AS)</span> </div> </div> </dd> <dt> <a name='item26'>[26]</a> <a href ="/abs/2409.04803" title="Abstract" id="2409.04803"> arXiv:2409.04803 </a> [<a href="/pdf/2409.04803" title="Download PDF" id="pdf-2409.04803" aria-labelledby="pdf-2409.04803">pdf</a>, <a 
href="https://arxiv.org/html/2409.04803v4" title="View HTML" id="html-2409.04803" aria-labelledby="html-2409.04803" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2409.04803" title="Other formats" id="oth-2409.04803" aria-labelledby="oth-2409.04803">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Cross-attention Inspired Selective State Space Models for Target Sound Extraction </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Wu,+D">Donghang Wu</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Wang,+Y">Yiwen Wang</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Wu,+X">Xihong Wu</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Qu,+T">Tianshu Qu</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 5 pages, 2 figures, accepted by ICASSP 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Audio and Speech Processing (eess.AS)</span>; Sound (cs.SD) </div> </div> </dd> <dt> <a name='item27'>[27]</a> <a href ="/abs/2409.04843" title="Abstract" id="2409.04843"> arXiv:2409.04843 </a> [<a href="/pdf/2409.04843" title="Download PDF" id="pdf-2409.04843" aria-labelledby="pdf-2409.04843">pdf</a>, <a href="https://arxiv.org/html/2409.04843v1" title="View HTML" id="html-2409.04843" aria-labelledby="html-2409.04843" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2409.04843" title="Other formats" id="oth-2409.04843" aria-labelledby="oth-2409.04843">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Leveraging Moving Sound Source Trajectories for Universal Sound Separation </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Wu,+D">Donghang Wu</a>, <a 
href="https://arxiv.org/search/eess?searchtype=author&amp;query=Wu,+X">Xihong Wu</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Qu,+T">Tianshu Qu</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 9 pages,7 figures,submitted to IEEE/ACM Transactions on Audio, Speech and Language Processing(TASLP) </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Audio and Speech Processing (eess.AS)</span>; Sound (cs.SD) </div> </div> </dd> <dt> <a name='item28'>[28]</a> <a href ="/abs/2409.05032" title="Abstract" id="2409.05032"> arXiv:2409.05032 </a> [<a href="/pdf/2409.05032" title="Download PDF" id="pdf-2409.05032" aria-labelledby="pdf-2409.05032">pdf</a>, <a href="https://arxiv.org/html/2409.05032v1" title="View HTML" id="html-2409.05032" aria-labelledby="html-2409.05032" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2409.05032" title="Other formats" id="oth-2409.05032" aria-labelledby="oth-2409.05032">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Exploring WavLM Back-ends for Speech Spoofing and Deepfake Detection </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Stourbe,+T">Theophile Stourbe</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Miara,+V">Victor Miara</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Lepage,+T">Theo Lepage</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Dehak,+R">Reda Dehak</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Audio and Speech Processing (eess.AS)</span>; Machine Learning (cs.LG); Sound (cs.SD) </div> </div> </dd> <dt> <a name='item29'>[29]</a> <a href ="/abs/2409.05034" title="Abstract" id="2409.05034"> arXiv:2409.05034 </a> [<a href="/pdf/2409.05034" 
title="Download PDF" id="pdf-2409.05034" aria-labelledby="pdf-2409.05034">pdf</a>, <a href="https://arxiv.org/html/2409.05034v1" title="View HTML" id="html-2409.05034" aria-labelledby="html-2409.05034" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2409.05034" title="Other formats" id="oth-2409.05034" aria-labelledby="oth-2409.05034">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> TF-Mamba: A Time-Frequency Network for Sound Source Localization </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Xiao,+Y">Yang Xiao</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Das,+R+K">Rohan Kumar Das</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Submitted to ICASSP 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Audio and Speech Processing (eess.AS)</span>; Sound (cs.SD) </div> </div> </dd> <dt> <a name='item30'>[30]</a> <a href ="/abs/2409.05116" title="Abstract" id="2409.05116"> arXiv:2409.05116 </a> [<a href="/pdf/2409.05116" title="Download PDF" id="pdf-2409.05116" aria-labelledby="pdf-2409.05116">pdf</a>, <a href="https://arxiv.org/html/2409.05116v2" title="View HTML" id="html-2409.05116" aria-labelledby="html-2409.05116" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2409.05116" title="Other formats" id="oth-2409.05116" aria-labelledby="oth-2409.05116">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Diffusion-based Speech Enhancement with Schrödinger Bridge and Symmetric Noise Schedule </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Wang,+S">Siyi Wang</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Liu,+S">Siyi Liu</a>, <a
href="https://arxiv.org/search/eess?searchtype=author&amp;query=Harper,+A">Andrew Harper</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Kendrick,+P">Paul Kendrick</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Salzmann,+M">Mathieu Salzmann</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Cernak,+M">Milos Cernak</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Audio and Speech Processing (eess.AS)</span>; Sound (cs.SD) </div> </div> </dd> <dt> <a name='item31'>[31]</a> <a href ="/abs/2409.05212" title="Abstract" id="2409.05212"> arXiv:2409.05212 </a> [<a href="/pdf/2409.05212" title="Download PDF" id="pdf-2409.05212" aria-labelledby="pdf-2409.05212">pdf</a>, <a href="https://arxiv.org/html/2409.05212v1" title="View HTML" id="html-2409.05212" aria-labelledby="html-2409.05212" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2409.05212" title="Other formats" id="oth-2409.05212" aria-labelledby="oth-2409.05212">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> SS-BRPE: Self-Supervised Blind Room Parameter Estimation Using Attention Mechanisms </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Wang,+C">Chunxi Wang</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Jia,+M">Maoshen Jia</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Li,+M">Meiran Li</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Bao,+C">Changchun Bao</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Jin,+W">Wenyu Jin</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 5 pages, 3 figures, submitted to ICASSP 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Audio 
and Speech Processing (eess.AS)</span>; Sound (cs.SD) </div> </div> </dd> <dt> <a name='item32'>[32]</a> <a href ="/abs/2409.05377" title="Abstract" id="2409.05377"> arXiv:2409.05377 </a> [<a href="/pdf/2409.05377" title="Download PDF" id="pdf-2409.05377" aria-labelledby="pdf-2409.05377">pdf</a>, <a href="https://arxiv.org/html/2409.05377v1" title="View HTML" id="html-2409.05377" aria-labelledby="html-2409.05377" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2409.05377" title="Other formats" id="oth-2409.05377" aria-labelledby="oth-2409.05377">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> BigCodec: Pushing the Limits of Low-Bitrate Neural Speech Codec </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Xin,+D">Detai Xin</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Tan,+X">Xu Tan</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Takamichi,+S">Shinnosuke Takamichi</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Saruwatari,+H">Hiroshi Saruwatari</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 4 pages, 1 figure. 
Audio samples available at: <a href="https://aria-k-alethia.github.io/bigcodec-demo/" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Audio and Speech Processing (eess.AS)</span>; Sound (cs.SD) </div> </div> </dd> <dt> <a name='item33'>[33]</a> <a href ="/abs/2409.05430" title="Abstract" id="2409.05430"> arXiv:2409.05430 </a> [<a href="/pdf/2409.05430" title="Download PDF" id="pdf-2409.05430" aria-labelledby="pdf-2409.05430">pdf</a>, <a href="https://arxiv.org/html/2409.05430v1" title="View HTML" id="html-2409.05430" aria-labelledby="html-2409.05430" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2409.05430" title="Other formats" id="oth-2409.05430" aria-labelledby="oth-2409.05430">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Findings of the 2024 Mandarin Stuttering Event Detection and Automatic Speech Recognition Challenge </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Xue,+H">Hongfei Xue</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Gong,+R">Rong Gong</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Shao,+M">Mingchen Shao</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Xu,+X">Xin Xu</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Wang,+L">Lezhi Wang</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Xie,+L">Lei Xie</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Bu,+H">Hui Bu</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Zhou,+J">Jiaming Zhou</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Qin,+Y">Yong Qin</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Du,+J">Jun Du</a>, 
<a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Li,+M">Ming Li</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Zhang,+B">Binbin Zhang</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Jia,+B">Bin Jia</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 8 pages, 2 figures, accepted by SLT 2024 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Audio and Speech Processing (eess.AS)</span>; Sound (cs.SD) </div> </div> </dd> <dt> <a name='item34'>[34]</a> <a href ="/abs/2409.05554" title="Abstract" id="2409.05554"> arXiv:2409.05554 </a> [<a href="/pdf/2409.05554" title="Download PDF" id="pdf-2409.05554" aria-labelledby="pdf-2409.05554">pdf</a>, <a href="https://arxiv.org/html/2409.05554v1" title="View HTML" id="html-2409.05554" aria-labelledby="html-2409.05554" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2409.05554" title="Other formats" id="oth-2409.05554" aria-labelledby="oth-2409.05554">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> NTT Multi-Speaker ASR System for the DASR Task of CHiME-8 Challenge </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Kamo,+N">Naoyuki Kamo</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Tawara,+N">Naohiro Tawara</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Ando,+A">Atsushi Ando</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Kano,+T">Takatomo Kano</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Sato,+H">Hiroshi Sato</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Ikeshita,+R">Rintaro Ikeshita</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Moriya,+T">Takafumi Moriya</a>, <a 
href="https://arxiv.org/search/eess?searchtype=author&amp;query=Horiguchi,+S">Shota Horiguchi</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Matsuura,+K">Kohei Matsuura</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Ogawa,+A">Atsunori Ogawa</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Plaquet,+A">Alexis Plaquet</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Ashihara,+T">Takanori Ashihara</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Ochiai,+T">Tsubasa Ochiai</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Mimura,+M">Masato Mimura</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Delcroix,+M">Marc Delcroix</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Nakatani,+T">Tomohiro Nakatani</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Asami,+T">Taichi Asami</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Araki,+S">Shoko Araki</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 5 pages, 4 figures, CHiME8 challenge </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Audio and Speech Processing (eess.AS)</span> </div> </div> </dd> <dt> <a name='item35'>[35]</a> <a href ="/abs/2409.05566" title="Abstract" id="2409.05566"> arXiv:2409.05566 </a> [<a href="/pdf/2409.05566" title="Download PDF" id="pdf-2409.05566" aria-labelledby="pdf-2409.05566">pdf</a>, <a href="https://arxiv.org/html/2409.05566v2" title="View HTML" id="html-2409.05566" aria-labelledby="html-2409.05566" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2409.05566" title="Other formats" id="oth-2409.05566" aria-labelledby="oth-2409.05566">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Leveraging 
Content and Acoustic Representations for Speech Emotion Recognition </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Dutta,+S">Soumya Dutta</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Ganapathy,+S">Sriram Ganapathy</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 11 pages, 5 figures, 6 tables </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Audio and Speech Processing (eess.AS)</span> </div> </div> </dd> <dt> <a name='item36'>[36]</a> <a href ="/abs/2409.05589" title="Abstract" id="2409.05589"> arXiv:2409.05589 </a> [<a href="/pdf/2409.05589" title="Download PDF" id="pdf-2409.05589" aria-labelledby="pdf-2409.05589">pdf</a>, <a href="https://arxiv.org/html/2409.05589v1" title="View HTML" id="html-2409.05589" aria-labelledby="html-2409.05589" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2409.05589" title="Other formats" id="oth-2409.05589" aria-labelledby="oth-2409.05589">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> An investigation of modularity for noise robustness in conformer-based ASR </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&amp;query=de+Gibson,+L+C">Louise Coppieters de Gibson</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Garner,+P+N">Philip N. 
Garner</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Honnet,+P">Pierre-Edouard Honnet</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 5 pages, 3 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Audio and Speech Processing (eess.AS)</span> </div> </div> </dd> <dt> <a name='item37'>[37]</a> <a href ="/abs/2409.05601" title="Abstract" id="2409.05601"> arXiv:2409.05601 </a> [<a href="/pdf/2409.05601" title="Download PDF" id="pdf-2409.05601" aria-labelledby="pdf-2409.05601">pdf</a>, <a href="https://arxiv.org/html/2409.05601v1" title="View HTML" id="html-2409.05601" aria-labelledby="html-2409.05601" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2409.05601" title="Other formats" id="oth-2409.05601" aria-labelledby="oth-2409.05601">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Longer is (Not Necessarily) Stronger: Punctuated Long-Sequence Training for Enhanced Speech Recognition and Translation </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Koluguri,+N+R">Nithin Rao Koluguri</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Bartley,+T">Travis Bartley</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Xu,+H">Hainan Xu</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Hrinchuk,+O">Oleksii Hrinchuk</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Balam,+J">Jagadeesh Balam</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Ginsburg,+B">Boris Ginsburg</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Kucsko,+G">Georg Kucsko</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted at SLT 2024 </div> <div class='list-subjects'><span 
class='descriptor'>Subjects:</span> <span class="primary-subject">Audio and Speech Processing (eess.AS)</span>; Computation and Language (cs.CL) </div> </div> </dd> <dt> <a name='item38'>[38]</a> <a href ="/abs/2409.05730" title="Abstract" id="2409.05730"> arXiv:2409.05730 </a> [<a href="/pdf/2409.05730" title="Download PDF" id="pdf-2409.05730" aria-labelledby="pdf-2409.05730">pdf</a>, <a href="https://arxiv.org/html/2409.05730v1" title="View HTML" id="html-2409.05730" aria-labelledby="html-2409.05730" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2409.05730" title="Other formats" id="oth-2409.05730" aria-labelledby="oth-2409.05730">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> AS-Speech: Adaptive Style For Speech Synthesis </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Li,+Z">Zhipeng Li</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Xing,+X">Xiaofen Xing</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Wang,+J">Jun Wang</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Chen,+S">Shuaiqi Chen</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Yu,+G">Guoqiao Yu</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Wan,+G">Guanglu Wan</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Xu,+X">Xiangmin Xu</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted by SLT 2024 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Audio and Speech Processing (eess.AS)</span> </div> </div> </dd> <dt> <a name='item39'>[39]</a> <a href ="/abs/2409.05750" title="Abstract" id="2409.05750"> arXiv:2409.05750 </a> [<a href="/pdf/2409.05750" title="Download PDF" id="pdf-2409.05750" aria-labelledby="pdf-2409.05750">pdf</a>, <a 
href="https://arxiv.org/html/2409.05750v1" title="View HTML" id="html-2409.05750" aria-labelledby="html-2409.05750" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2409.05750" title="Other formats" id="oth-2409.05750" aria-labelledby="oth-2409.05750">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> A Toolkit for Joint Speaker Diarization and Identification with Application to Speaker-Attributed ASR </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Morrone,+G">Giovanni Morrone</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Zovato,+E">Enrico Zovato</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Brugnara,+F">Fabio Brugnara</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Sartori,+E">Enrico Sartori</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Badino,+L">Leonardo Badino</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Show and Tell paper. Presented at Interspeech 2024 </div> <div class='list-journal-ref'><span class='descriptor'>Journal-ref:</span> Proceedings of Interspeech 2024, pp. 
3652--3653 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Audio and Speech Processing (eess.AS)</span>; Multimedia (cs.MM) </div> </div> </dd> <dt> <a name='item40'>[40]</a> <a href ="/abs/2409.05910" title="Abstract" id="2409.05910"> arXiv:2409.05910 </a> [<a href="/pdf/2409.05910" title="Download PDF" id="pdf-2409.05910" aria-labelledby="pdf-2409.05910">pdf</a>, <a href="https://arxiv.org/html/2409.05910v2" title="View HTML" id="html-2409.05910" aria-labelledby="html-2409.05910" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2409.05910" title="Other formats" id="oth-2409.05910" aria-labelledby="oth-2409.05910">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Property Neurons in Self-Supervised Speech Transformers </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Lin,+T">Tzu-Quan Lin</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Lin,+G">Guan-Ting Lin</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Lee,+H">Hung-yi Lee</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Tang,+H">Hao Tang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted by SLT 2024 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Audio and Speech Processing (eess.AS)</span>; Computation and Language (cs.CL); Machine Learning (cs.LG); Sound (cs.SD) </div> </div> </dd> <dt> <a name='item41'>[41]</a> <a href ="/abs/2409.06062" title="Abstract" id="2409.06062"> arXiv:2409.06062 </a> [<a href="/pdf/2409.06062" title="Download PDF" id="pdf-2409.06062" aria-labelledby="pdf-2409.06062">pdf</a>, <a href="https://arxiv.org/html/2409.06062v1" title="View HTML" id="html-2409.06062" aria-labelledby="html-2409.06062" rel="noopener noreferrer" 
target="_blank">html</a>, <a href="/format/2409.06062" title="Other formats" id="oth-2409.06062" aria-labelledby="oth-2409.06062">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Retrieval Augmented Correction of Named Entity Speech Recognition Errors </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Pusateri,+E">Ernest Pusateri</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Walia,+A">Anmol Walia</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Kashi,+A">Anirudh Kashi</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Bandyopadhyay,+B">Bortik Bandyopadhyay</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Hyder,+N">Nadia Hyder</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Mahinder,+S">Sayantan Mahinder</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Anantha,+R">Raviteja Anantha</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Liu,+D">Daben Liu</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Gondala,+S">Sashank Gondala</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Submitted to ICASSP 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Audio and Speech Processing (eess.AS)</span>; Sound (cs.SD) </div> </div> </dd> <dt> <a name='item42'>[42]</a> <a href ="/abs/2409.06109" title="Abstract" id="2409.06109"> arXiv:2409.06109 </a> [<a href="/pdf/2409.06109" title="Download PDF" id="pdf-2409.06109" aria-labelledby="pdf-2409.06109">pdf</a>, <a href="https://arxiv.org/html/2409.06109v2" title="View HTML" id="html-2409.06109" aria-labelledby="html-2409.06109" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2409.06109" title="Other formats" id="oth-2409.06109" 
aria-labelledby="oth-2409.06109">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Estimating the Completeness of Discrete Speech Units </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Yeh,+S">Sung-Lin Yeh</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Tang,+H">Hao Tang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> SLT2024 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Audio and Speech Processing (eess.AS)</span>; Computation and Language (cs.CL) </div> </div> </dd> <dt> <a name='item43'>[43]</a> <a href ="/abs/2409.06126" title="Abstract" id="2409.06126"> arXiv:2409.06126 </a> [<a href="/pdf/2409.06126" title="Download PDF" id="pdf-2409.06126" aria-labelledby="pdf-2409.06126">pdf</a>, <a href="https://arxiv.org/html/2409.06126v1" title="View HTML" id="html-2409.06126" aria-labelledby="html-2409.06126" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2409.06126" title="Other formats" id="oth-2409.06126" aria-labelledby="oth-2409.06126">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> VC-ENHANCE: Speech Restoration with Integrated Noise Suppression and Voice Conversion </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Byun,+K">Kyungguen Byun</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Filos,+J">Jason Filos</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Visser,+E">Erik Visser</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Moon,+S">Sunkuk Moon</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 5 pages, 3 figures, submitted to ICASSP 2025 </div> <div class='list-subjects'><span 
class='descriptor'>Subjects:</span> <span class="primary-subject">Audio and Speech Processing (eess.AS)</span>; Sound (cs.SD) </div> </div> </dd> <dt> <a name='item44'>[44]</a> <a href ="/abs/2409.06137" title="Abstract" id="2409.06137"> arXiv:2409.06137 </a> [<a href="/pdf/2409.06137" title="Download PDF" id="pdf-2409.06137" aria-labelledby="pdf-2409.06137">pdf</a>, <a href="https://arxiv.org/html/2409.06137v1" title="View HTML" id="html-2409.06137" aria-labelledby="html-2409.06137" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2409.06137" title="Other formats" id="oth-2409.06137" aria-labelledby="oth-2409.06137">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> DeWinder: Single-Channel Wind Noise Reduction using Ultrasound Sensing </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Yuan,+K">Kuang Yuan</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Han,+S">Shuo Han</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Kumar,+S">Swarun Kumar</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Raj,+B">Bhiksha Raj</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Audio and Speech Processing (eess.AS)</span>; Sound (cs.SD); Signal Processing (eess.SP) </div> </div> </dd> <dt> <a name='item45'>[45]</a> <a href ="/abs/2409.06190" title="Abstract" id="2409.06190"> arXiv:2409.06190 </a> [<a href="/pdf/2409.06190" title="Download PDF" id="pdf-2409.06190" aria-labelledby="pdf-2409.06190">pdf</a>, <a href="https://arxiv.org/html/2409.06190v3" title="View HTML" id="html-2409.06190" aria-labelledby="html-2409.06190" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2409.06190" title="Other formats" id="oth-2409.06190" aria-labelledby="oth-2409.06190">other</a>] </dt> <dd> <div class='meta'> <div 
class='list-title mathjax'><span class='descriptor'>Title:</span> Multi-Source Music Generation with Latent Diffusion </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Xu,+Z">Zhongweiyang Xu</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Dutta,+D">Debottam Dutta</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Wei,+Y">Yu-Lin Wei</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Choudhury,+R+R">Romit Roy Choudhury</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> ICASSP 2025 in Submission </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Audio and Speech Processing (eess.AS)</span>; Machine Learning (cs.LG); Sound (cs.SD) </div> </div> </dd> <dt> <a name='item46'>[46]</a> <a href ="/abs/2409.06327" title="Abstract" id="2409.06327"> arXiv:2409.06327 </a> [<a href="/pdf/2409.06327" title="Download PDF" id="pdf-2409.06327" aria-labelledby="pdf-2409.06327">pdf</a>, <a href="https://arxiv.org/html/2409.06327v1" title="View HTML" id="html-2409.06327" aria-labelledby="html-2409.06327" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2409.06327" title="Other formats" id="oth-2409.06327" aria-labelledby="oth-2409.06327">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Spoofing-Aware Speaker Verification Robust Against Domain and Channel Mismatches </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Zeng,+C">Chang Zeng</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Miao,+X">Xiaoxiao Miao</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Wang,+X">Xin Wang</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Cooper,+E">Erica Cooper</a>, <a 
href="https://arxiv.org/search/eess?searchtype=author&amp;query=Yamagishi,+J">Junichi Yamagishi</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> To appear in 2024 IEEE Spoken Language Technology Workshop, Dec 02-05, 2024, Macao, China </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Audio and Speech Processing (eess.AS)</span>; Sound (cs.SD) </div> </div> </dd> <dt> <a name='item47'>[47]</a> <a href ="/abs/2409.06330" title="Abstract" id="2409.06330"> arXiv:2409.06330 </a> [<a href="/pdf/2409.06330" title="Download PDF" id="pdf-2409.06330" aria-labelledby="pdf-2409.06330">pdf</a>, <a href="https://arxiv.org/html/2409.06330v1" title="View HTML" id="html-2409.06330" aria-labelledby="html-2409.06330" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2409.06330" title="Other formats" id="oth-2409.06330" aria-labelledby="oth-2409.06330">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> InstructSing: High-Fidelity Singing Voice Generation via Instructing Yourself </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Zeng,+C">Chang Zeng</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Wang,+C">Chunhui Wang</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Miao,+X">Xiaoxiao Miao</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Zhao,+J">Jian Zhao</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Jiang,+Z">Zhonglin Jiang</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Chen,+Y">Yong Chen</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> To appear in 2024 IEEE Spoken Language Technology Workshop, Dec 02-05, 2024, Macao, China </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span 
class="primary-subject">Audio and Speech Processing (eess.AS)</span>; Sound (cs.SD) </div> </div> </dd> <dt> <a name='item48'>[48]</a> <a href ="/abs/2409.06392" title="Abstract" id="2409.06392"> arXiv:2409.06392 </a> [<a href="/pdf/2409.06392" title="Download PDF" id="pdf-2409.06392" aria-labelledby="pdf-2409.06392">pdf</a>, <a href="https://arxiv.org/html/2409.06392v2" title="View HTML" id="html-2409.06392" aria-labelledby="html-2409.06392" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2409.06392" title="Other formats" id="oth-2409.06392" aria-labelledby="oth-2409.06392">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Janssen 2.0: Audio Inpainting in the Time-frequency Domain </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Mokr%C3%BD,+O">Ondřej Mokrý</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Balu%C5%A1%C3%ADk,+P">Peter Balušík</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Rajmic,+P">Pavel Rajmic</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Submitted to EUSIPCO 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Audio and Speech Processing (eess.AS)</span>; Sound (cs.SD) </div> </div> </dd> <dt> <a name='item49'>[49]</a> <a href ="/abs/2409.06580" title="Abstract" id="2409.06580"> arXiv:2409.06580 </a> [<a href="/pdf/2409.06580" title="Download PDF" id="pdf-2409.06580" aria-labelledby="pdf-2409.06580">pdf</a>, <a href="https://arxiv.org/html/2409.06580v1" title="View HTML" id="html-2409.06580" aria-labelledby="html-2409.06580" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2409.06580" title="Other formats" id="oth-2409.06580" aria-labelledby="oth-2409.06580">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span
class='descriptor'>Title:</span> Exploring Differences between Human Perception and Model Inference in Audio Event Recognition </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Tan,+Y">Yizhou Tan</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Wu,+Y">Yanru Wu</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Hou,+Y">Yuanbo Hou</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Xu,+X">Xin Xu</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Bu,+H">Hui Bu</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Li,+S">Shengchen Li</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Botteldooren,+D">Dick Botteldooren</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Plumbley,+M+D">Mark D. Plumbley</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Dataset homepage: <a href="https://github.com/Voltmeter00/MAFAR" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Audio and Speech Processing (eess.AS)</span>; Sound (cs.SD) </div> </div> </dd> <dt> <a name='item50'>[50]</a> <a href ="/abs/2409.06656" title="Abstract" id="2409.06656"> arXiv:2409.06656 </a> [<a href="/pdf/2409.06656" title="Download PDF" id="pdf-2409.06656" aria-labelledby="pdf-2409.06656">pdf</a>, <a href="https://arxiv.org/html/2409.06656v2" title="View HTML" id="html-2409.06656" aria-labelledby="html-2409.06656" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2409.06656" title="Other formats" id="oth-2409.06656" aria-labelledby="oth-2409.06656">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Sortformer: Seamless Integration of Speaker Diarization and ASR 
by Bridging Timestamps and Tokens </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Park,+T">Taejin Park</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Medennikov,+I">Ivan Medennikov</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Dhawan,+K">Kunal Dhawan</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Wang,+W">Weiqing Wang</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Huang,+H">He Huang</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Koluguri,+N+R">Nithin Rao Koluguri</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Puvvada,+K+C">Krishna C. Puvvada</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Balam,+J">Jagadeesh Balam</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Ginsburg,+B">Boris Ginsburg</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Audio and Speech Processing (eess.AS)</span>; Computation and Language (cs.CL); Machine Learning (cs.LG); Sound (cs.SD) </div> </div> </dd> </dl> <div class='paging'>Total of 541 entries : <span>1-50</span> <a href=/list/eess.AS/2024-09?skip=50&amp;show=50>51-100</a> <a href=/list/eess.AS/2024-09?skip=100&amp;show=50>101-150</a> <a href=/list/eess.AS/2024-09?skip=150&amp;show=50>151-200</a> <span>...</span> <a href=/list/eess.AS/2024-09?skip=500&amp;show=50>501-541</a> </div> <div class='morefewer'>Showing up to 50 entries per page: <a href=/list/eess.AS/2024-09?skip=0&amp;show=25 rel="nofollow"> fewer</a> | <a href=/list/eess.AS/2024-09?skip=0&amp;show=100 rel="nofollow"> more</a> | <a href=/list/eess.AS/2024-09?skip=0&amp;show=2000 rel="nofollow"> all</a> </div> </div> </div> </div> </main> <footer style="clear: both;"> <div class="columns is-desktop" role="navigation" aria-label="Secondary" style="margin: -0.75em -0.75em 
0.75em -0.75em"> <!-- Macro-Column 1 --> <div class="column" style="padding: 0;"> <div class="columns"> <div class="column"> <ul style="list-style: none; line-height: 2;"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul style="list-style: none; line-height: 2;"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- End Macro-Column 1 --> <!-- Macro-Column 2 --> <div class="column" style="padding: 0;"> <div class="columns"> <div class="column"> <ul style="list-style: none; line-height: 2;"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div 
class="column sorry-app-links"> <ul style="list-style: none; line-height: 2;"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 
196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> <!-- End Macro-Column 2 --> </div> </footer> </div> <script src="/static/base/1.0.1/js/member_acknowledgement.js"></script> </body> </html>

Pages: 1 2 3 4 5 6 7 8 9 10