Audio and Speech Processing

<!DOCTYPE html> <html lang="en"> <head> <title>Audio and Speech Processing </title> <meta name="viewport" content="width=device-width, initial-scale=1"> <link rel="apple-touch-icon" sizes="180x180" href="/static/browse/0.3.4/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="/static/browse/0.3.4/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="/static/browse/0.3.4/images/icons/favicon-16x16.png"> <link rel="manifest" href="/static/browse/0.3.4/images/icons/site.webmanifest"> <link rel="mask-icon" href="/static/browse/0.3.4/images/icons/safari-pinned-tab.svg" color="#5bbad5"> <meta name="msapplication-TileColor" content="#da532c"> <meta name="theme-color" content="#ffffff"> <link rel="stylesheet" type="text/css" media="screen" href="/static/browse/0.3.4/css/arXiv.css?v=20241206" /> <link rel="stylesheet" type="text/css" media="print" href="/static/browse/0.3.4/css/arXiv-print.css?v=20200611" /> <link rel="stylesheet" type="text/css" media="screen" href="/static/browse/0.3.4/css/browse_search.css" /> <script language="javascript" src="/static/browse/0.3.4/js/accordion.js" /></script> <script src="/static/browse/0.3.4/js/mathjaxToggle.min.js" type="text/javascript"></script> <script type="text/javascript" language="javascript">mathjaxToggle();</script> </head> <body class="with-cu-identity"> <div class="flex-wrap-footer"> <header> <a href="#content" class="is-sr-only">Skip to main content</a>  <div class="columns is-vcentered is-hidden-mobile" id="cu-identity"> <div class="column" id="cu-logo"> <a href="https://www.cornell.edu/"><img src="/static/browse/0.3.4/images/icons/cu/cornell-reduced-white-SMALL.svg" alt="Cornell University" /></a> </div><div class="column" id="support-ack"> <span id="support-ack-url">We gratefully acknowledge support from the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors.</span> <a 
href="https://info.arxiv.org/about/donate.html" class="btn-header-donate">Donate</a> </div> </div> <div id="header" class="is-hidden-mobile"> <a aria-hidden="true" tabindex="-1" href="/IgnoreMe"></a> <div class="header-breadcrumbs"> <a href="/"><img src="/static/browse/0.3.4/images/arxiv-logo-one-color-white.svg" alt="arxiv logo" style="height:40px;"/></a> <span>></span> <a href="/list/eess.AS/recent">eess.AS</a> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <div class="mobile-header"> <div class="columns is-mobile"> <div class="column logo-arxiv"><a href="https://arxiv.org/"><img 
src="/static/browse/0.3.4/images/arxiv-logomark-small-white.svg" alt="arXiv logo" style="height:60px;" /></a></div> <div class="column logo-cornell"><a href="https://www.cornell.edu/"> <picture> <source media="(min-width: 501px)" srcset="/static/browse/0.3.4/images/icons/cu/cornell-reduced-white-SMALL.svg 400w" sizes="400w" /> <source srcset="/static/browse/0.3.4/images/icons/cu/cornell_seal_simple_black.svg 2x" /> <img src="/static/browse/0.3.4/images/icons/cu/cornell-reduced-white-SMALL.svg" alt="Cornell University Logo" /> </picture> </a></div> <div class="column nav" id="toggle-container" role="menubar"> <button class="toggle-control"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-white"><title>open search</title><path d="M505 442.7L405.3 343c-4.5-4.5-10.6-7-17-7H372c27.6-35.3 44-79.7 44-128C416 93.1 322.9 0 208 0S0 93.1 0 208s93.1 208 208 208c48.3 0 92.7-16.4 128-44v16.3c0 6.4 2.5 12.5 7 17l99.7 99.7c9.4 9.4 24.6 9.4 33.9 0l28.3-28.3c9.4-9.4 9.4-24.6.1-34zM208 336c-70.7 0-128-57.2-128-128 0-70.7 57.2-128 128-128 70.7 0 128 57.2 128 128 0 70.7-57.2 128-128 128z"/></svg></button> <div class="mobile-toggle-block toggle-target"> <form class="mobile-search-form" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <input class="input" type="text" name="query" placeholder="Search..." 
aria-label="Search term or terms" /> <input type="hidden" name="source" value="header"> <input type="hidden" name="searchtype" value="all"> <button class="button">GO</button> </div> </form> </div> <button class="toggle-control"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-white" role="menu"><title>open navigation menu</title><path d="M16 132h416c8.837 0 16-7.163 16-16V76c0-8.837-7.163-16-16-16H16C7.163 60 0 67.163 0 76v40c0 8.837 7.163 16 16 16zm0 160h416c8.837 0 16-7.163 16-16v-40c0-8.837-7.163-16-16-16H16c-8.837 0-16 7.163-16 16v40c0 8.837 7.163 16 16 16zm0 160h416c8.837 0 16-7.163 16-16v-40c0-8.837-7.163-16-16-16H16c-8.837 0-16 7.163-16 16v40c0 8.837 7.163 16 16 16z"/></svg></button> <div class="mobile-toggle-block toggle-target"> <nav class="mobile-menu" aria-labelledby="mobilemenulabel"> <h2 id="mobilemenulabel">quick links</h2> <ul> <li><a href="https://arxiv.org/login">Login</a></li> <li><a href="https://info.arxiv.org/help">Help Pages</a></li> <li><a href="https://info.arxiv.org/about">About</a></li> </ul> </nav> </div> </div> </div> </div> </header> <main> <div id="content"> <div id='content-inner'> <div id='dlpage'> <h1>Audio and Speech Processing</h1> <h2>Authors and titles for recent submissions</h2> <ul> <li> <a href="/list/eess.AS/recent?skip=0&show=50"> Tue, 25 Mar 2025 </a> </li><li> <a href="/list/eess.AS/recent?skip=11&show=50"> Mon, 24 Mar 2025 </a> </li><li> <a href="/list/eess.AS/recent?skip=15&show=50"> Fri, 21 Mar 2025 </a> </li><li> <a href="/list/eess.AS/recent?skip=19&show=50"> Thu, 20 Mar 2025 </a> </li><li> <a href="/list/eess.AS/recent?skip=25&show=50"> Wed, 19 Mar 2025 </a> </li></ul> <p>See today's <a id="new-eess.AS" aria-labelledby="new-eess.AS" href="/list/eess.AS/new">new</a> changes</p> <div class='paging'>Total of 29 entries </div> <div class='morefewer'>Showing up to 50 entries per page: <a href="/list/eess.AS/recent?skip=0&show=25" rel="nofollow"> fewer</a> | <span style="color: 
#454545">more</span> | <span style="color: #454545">all</span> </div> <dl id='articles'> <h3>Tue, 25 Mar 2025 (showing 11 of 11 entries )</h3> <dt> <a name='item1'>[1]</a> <a href ="/abs/2503.18600" title="Abstract" id="2503.18600"> arXiv:2503.18600 </a> [<a href="/pdf/2503.18600" title="Download PDF" id="pdf-2503.18600" aria-labelledby="pdf-2503.18600">pdf</a>, <a href="https://arxiv.org/html/2503.18600v1" title="View HTML" id="html-2503.18600" aria-labelledby="html-2503.18600" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.18600" title="Other formats" id="oth-2503.18600" aria-labelledby="oth-2503.18600">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Joint Spectrogram Separation and TDOA Estimation using Optimal Transport </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&query=Fabiani,+L">Linda Fabiani</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Schlecht,+S+J">Sebastian J. 
Schlecht</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Haasler,+I">Isabel Haasler</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Elvander,+F">Filip Elvander</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Audio and Speech Processing (eess.AS)</span>; Signal Processing (eess.SP) </div> </div> </dd> <dt> <a name='item2'>[2]</a> <a href ="/abs/2503.18590" title="Abstract" id="2503.18590"> arXiv:2503.18590 </a> [<a href="/pdf/2503.18590" title="Download PDF" id="pdf-2503.18590" aria-labelledby="pdf-2503.18590">pdf</a>, <a href="https://arxiv.org/html/2503.18590v1" title="View HTML" id="html-2503.18590" aria-labelledby="html-2503.18590" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.18590" title="Other formats" id="oth-2503.18590" aria-labelledby="oth-2503.18590">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Target Speaker Selection for Neural Network Beamforming in Multi-Speaker Scenarios </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&query=Fiorio,+L+V">Luan Vinícius Fiorio</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Defraene,+B">Bruno Defraene</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=David,+J">Johan David</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Young,+A">Alex Young</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Widdershoven,+F">Frans Widdershoven</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=van+Houtum,+W">Wim van Houtum</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Aarts,+R+M">Ronald M. 
Aarts</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Audio and Speech Processing (eess.AS)</span>; Signal Processing (eess.SP) </div> </div> </dd> <dt> <a name='item3'>[3]</a> <a href ="/abs/2503.18579" title="Abstract" id="2503.18579"> arXiv:2503.18579 </a> [<a href="/pdf/2503.18579" title="Download PDF" id="pdf-2503.18579" aria-labelledby="pdf-2503.18579">pdf</a>, <a href="https://arxiv.org/html/2503.18579v1" title="View HTML" id="html-2503.18579" aria-labelledby="html-2503.18579" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.18579" title="Other formats" id="oth-2503.18579" aria-labelledby="oth-2503.18579">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Unsupervised Variational Acoustic Clustering </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&query=Fiorio,+L+V">Luan Vinícius Fiorio</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Defraene,+B">Bruno Defraene</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=David,+J">Johan David</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Widdershoven,+F">Frans Widdershoven</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=van+Houtum,+W">Wim van Houtum</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Aarts,+R+M">Ronald M. 
Aarts</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Audio and Speech Processing (eess.AS)</span>; Signal Processing (eess.SP) </div> </div> </dd> <dt> <a name='item4'>[4]</a> <a href ="/abs/2503.18022" title="Abstract" id="2503.18022"> arXiv:2503.18022 </a> [<a href="/pdf/2503.18022" title="Download PDF" id="pdf-2503.18022" aria-labelledby="pdf-2503.18022">pdf</a>, <a href="https://arxiv.org/html/2503.18022v1" title="View HTML" id="html-2503.18022" aria-labelledby="html-2503.18022" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.18022" title="Other formats" id="oth-2503.18022" aria-labelledby="oth-2503.18022">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> A State-of-the-Art Review on Acoustic Preservation of Historical Worship Spaces through Auralization </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&query=Rosseel,+H">Hannes Rosseel</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=van+Waterschoot,+T">Toon van Waterschoot</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 32 pages, 7 figures, 4 tables, Published in Signal Processing </div> <div class='list-journal-ref'><span class='descriptor'>Journal-ref:</span> Signal Processing, vol. 234, p. 
1-25, 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Audio and Speech Processing (eess.AS)</span>; Sound (cs.SD); Signal Processing (eess.SP) </div> </div> </dd> <dt> <a name='item5'>[5]</a> <a href ="/abs/2503.18928" title="Abstract" id="2503.18928"> arXiv:2503.18928 </a> (cross-list from cs.SD) [<a href="/pdf/2503.18928" title="Download PDF" id="pdf-2503.18928" aria-labelledby="pdf-2503.18928">pdf</a>, <a href="/format/2503.18928" title="Other formats" id="oth-2503.18928" aria-labelledby="oth-2503.18928">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> A Reliable and Efficient Detection Pipeline for Rodent Ultrasonic Vocalizations </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Anis,+S+S">Sabah Shahnoor Anis</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kellis,+D+M">Devin M. Kellis</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kaigler,+K+F">Kris Ford Kaigler</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wilson,+M+A">Marlene A. 
Wilson</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=O'Reilly,+C">Christian O'Reilly</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted for publication in the proceeding of the 7th International Conference on Advances in Signal Processing and Artificial Intelligence (ASPAI' 2025), 8-10 April 2025, Innsbruck, Austria </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Sound (cs.SD)</span>; Audio and Speech Processing (eess.AS) </div> </div> </dd> <dt> <a name='item6'>[6]</a> <a href ="/abs/2503.18880" title="Abstract" id="2503.18880"> arXiv:2503.18880 </a> (cross-list from cs.CV) [<a href="/pdf/2503.18880" title="Download PDF" id="pdf-2503.18880" aria-labelledby="pdf-2503.18880">pdf</a>, <a href="https://arxiv.org/html/2503.18880v1" title="View HTML" id="html-2503.18880" aria-labelledby="html-2503.18880" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.18880" title="Other formats" id="oth-2503.18880" aria-labelledby="oth-2503.18880">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Seeing Speech and Sound: Distinguishing and Locating Audios in Visual Scenes </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Ryu,+H">Hyeonggon Ryu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kim,+S">Seongyu Kim</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chung,+J+S">Joon Son Chung</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Senocak,+A">Arda Senocak</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> CVPR 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Sound (cs.SD); Audio and Speech Processing (eess.AS) </div> </div> </dd> 
<dt> <a name='item7'>[7]</a> <a href ="/abs/2503.18698" title="Abstract" id="2503.18698"> arXiv:2503.18698 </a> (cross-list from cs.SD) [<a href="/pdf/2503.18698" title="Download PDF" id="pdf-2503.18698" aria-labelledby="pdf-2503.18698">pdf</a>, <a href="https://arxiv.org/html/2503.18698v1" title="View HTML" id="html-2503.18698" aria-labelledby="html-2503.18698" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.18698" title="Other formats" id="oth-2503.18698" aria-labelledby="oth-2503.18698">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Wireless Hearables With Programmable Speech AI Accelerators </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Itani,+M">Malek Itani</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+T">Tuochao Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Raghavan,+A">Arun Raghavan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kohlberg,+G">Gavriel Kohlberg</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gollakota,+S">Shyamnath Gollakota</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Sound (cs.SD)</span>; Audio and Speech Processing (eess.AS) </div> </div> </dd> <dt> <a name='item8'>[8]</a> <a href ="/abs/2503.18486" title="Abstract" id="2503.18486"> arXiv:2503.18486 </a> (cross-list from cs.SD) [<a href="/pdf/2503.18486" title="Download PDF" id="pdf-2503.18486" aria-labelledby="pdf-2503.18486">pdf</a>, <a href="https://arxiv.org/html/2503.18486v1" title="View HTML" id="html-2503.18486" aria-labelledby="html-2503.18486" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.18486" title="Other formats" id="oth-2503.18486" aria-labelledby="oth-2503.18486">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span 
class='descriptor'>Title:</span> Music Similarity Representation Learning Focusing on Individual Instruments with Source Separation and Human Preference </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Imamura,+T">Takehiro Imamura</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hashizume,+Y">Yuka Hashizume</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+W">Wen-Chin Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Toda,+T">Tomoki Toda</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Sound (cs.SD)</span>; Audio and Speech Processing (eess.AS) </div> </div> </dd> <dt> <a name='item9'>[9]</a> <a href ="/abs/2503.17886" title="Abstract" id="2503.17886"> arXiv:2503.17886 </a> (cross-list from cs.SD) [<a href="/pdf/2503.17886" title="Download PDF" id="pdf-2503.17886" aria-labelledby="pdf-2503.17886">pdf</a>, <a href="https://arxiv.org/html/2503.17886v1" title="View HTML" id="html-2503.17886" aria-labelledby="html-2503.17886" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.17886" title="Other formats" id="oth-2503.17886" aria-labelledby="oth-2503.17886">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Elevating Robust Multi-Talker ASR by Decoupling Speaker Separation and Speech Recognition </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+Y">Yufeng Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Taherian,+H">Hassan Taherian</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kalkhorani,+V+A">Vahid Ahmadi Kalkhorani</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+D">DeLiang Wang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Sound (cs.SD)</span>; 
Audio and Speech Processing (eess.AS) </div> </div> </dd> <dt> <a name='item10'>[10]</a> <a href ="/abs/2503.17634" title="Abstract" id="2503.17634"> arXiv:2503.17634 </a> (cross-list from eess.SY) [<a href="/pdf/2503.17634" title="Download PDF" id="pdf-2503.17634" aria-labelledby="pdf-2503.17634">pdf</a>, <a href="https://arxiv.org/html/2503.17634v1" title="View HTML" id="html-2503.17634" aria-labelledby="html-2503.17634" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.17634" title="Other formats" id="oth-2503.17634" aria-labelledby="oth-2503.17634">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Mixed-gradients Distributed Filtered Reference Least Mean Square Algorithm -- A Robust Distributed Multichannel Active Noise Control Algorithm </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&query=Ji,+J">Junwei Ji</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Shi,+D">Dongyuan Shi</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Gan,+W">Woon-Seng Gan</a></div> <div class='list-journal-ref'><span class='descriptor'>Journal-ref:</span> IEEE Transactions on Audio, Speech and Language Processing,2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Systems and Control (eess.SY)</span>; Audio and Speech Processing (eess.AS); Signal Processing (eess.SP) </div> </div> </dd> <dt> <a name='item11'>[11]</a> <a href ="/abs/2503.17551" title="Abstract" id="2503.17551"> arXiv:2503.17551 </a> (cross-list from cs.MM) [<a href="/pdf/2503.17551" title="Download PDF" id="pdf-2503.17551" aria-labelledby="pdf-2503.17551">pdf</a>, <a href="https://arxiv.org/html/2503.17551v1" title="View HTML" id="html-2503.17551" aria-labelledby="html-2503.17551" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.17551" title="Other formats" id="oth-2503.17551" 
aria-labelledby="oth-2503.17551">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Audio-Enhanced Vision-Language Modeling with Latent Space Broadening for High Quality Data Expansion </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+Y">Yu Sun</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Y">Yin Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+R">Ruixiao Sun</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+C">Chunhui Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+F">Fangming Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jin,+Z">Ze Jin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+L">Linjie Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shen,+X">Xiang Shen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hao,+Z">Zhuolin Hao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xiong,+H">Hongyu Xiong</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Multimedia (cs.MM)</span>; Artificial Intelligence (cs.AI); Computer Vision and Pattern Recognition (cs.CV); Sound (cs.SD); Audio and Speech Processing (eess.AS) </div> </div> </dd> </dl> <dl id='articles'> <h3>Mon, 24 Mar 2025 (showing 4 of 4 entries )</h3> <dt> <a name='item12'>[12]</a> <a href ="/abs/2503.16956" title="Abstract" id="2503.16956"> arXiv:2503.16956 </a> [<a href="/pdf/2503.16956" title="Download PDF" id="pdf-2503.16956" aria-labelledby="pdf-2503.16956">pdf</a>, <a href="https://arxiv.org/html/2503.16956v1" title="View HTML" id="html-2503.16956" aria-labelledby="html-2503.16956" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.16956" title="Other formats" id="oth-2503.16956" 
aria-labelledby="oth-2503.16956">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> From Faces to Voices: Learning Hierarchical Representations for High-quality Video-to-Speech </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&query=Kim,+J">Ji-Hoon Kim</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Choi,+J">Jeongsoo Choi</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Kim,+J">Jaehun Kim</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Jung,+C">Chaeyoung Jung</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Chung,+J+S">Joon Son Chung</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> CVPR 2025, demo page: <a href="https://mm.kaist.ac.kr/projects/faces2voices/" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Audio and Speech Processing (eess.AS)</span>; Artificial Intelligence (cs.AI); Computer Vision and Pattern Recognition (cs.CV); Sound (cs.SD) </div> </div> </dd> <dt> <a name='item13'>[13]</a> <a href ="/abs/2503.17281" title="Abstract" id="2503.17281"> arXiv:2503.17281 </a> (cross-list from cs.SD) [<a href="/pdf/2503.17281" title="Download PDF" id="pdf-2503.17281" aria-labelledby="pdf-2503.17281">pdf</a>, <a href="https://arxiv.org/html/2503.17281v1" title="View HTML" id="html-2503.17281" aria-labelledby="html-2503.17281" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.17281" title="Other formats" id="oth-2503.17281" aria-labelledby="oth-2503.17281">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Learning disentangled representations for instrument-based music similarity </div> <div class='list-authors'><a 
href="https://arxiv.org/search/cs?searchtype=author&query=Hashizume,+Y">Yuka Hashizume</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+L">Li Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Miyashita,+A">Atsushi Miyashita</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Toda,+T">Tomoki Toda</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> arXiv admin note: text overlap with <a href="https://arxiv.org/abs/2404.06682" data-arxiv-id="2404.06682" class="link-https">arXiv:2404.06682</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Sound (cs.SD)</span>; Audio and Speech Processing (eess.AS) </div> </div> </dd> <dt> <a name='item14'>[14]</a> <a href ="/abs/2503.16853" title="Abstract" id="2503.16853"> arXiv:2503.16853 </a> (cross-list from cs.CL) [<a href="/pdf/2503.16853" title="Download PDF" id="pdf-2503.16853" aria-labelledby="pdf-2503.16853">pdf</a>, <a href="https://arxiv.org/html/2503.16853v1" title="View HTML" id="html-2503.16853" aria-labelledby="html-2503.16853" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.16853" title="Other formats" id="oth-2503.16853" aria-labelledby="oth-2503.16853">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Imagine to Hear: Auditory Knowledge Generation can be an Effective Assistant for Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Yoo,+S">Suho Yoo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ok,+H">Hyunjong Ok</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lee,+J">Jaeho Lee</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Preprint </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and 
Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Machine Learning (cs.LG); Sound (cs.SD); Audio and Speech Processing (eess.AS) </div> </div> </dd> <dt> <a name='item15'>[15]</a> <a href ="/abs/2503.16578" title="Abstract" id="2503.16578"> arXiv:2503.16578 </a> (cross-list from cs.CL) [<a href="/pdf/2503.16578" title="Download PDF" id="pdf-2503.16578" aria-labelledby="pdf-2503.16578">pdf</a>, <a href="https://arxiv.org/html/2503.16578v1" title="View HTML" id="html-2503.16578" aria-labelledby="html-2503.16578" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.16578" title="Other formats" id="oth-2503.16578" aria-labelledby="oth-2503.16578">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> SeniorTalk: A Chinese Conversation Dataset with Rich Annotations for Super-Aged Seniors </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+Y">Yang Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+H">Hui Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+S">Shiyao Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+J">Junyang Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=He,+J">Jiabei He</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+J">Jiaming Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+X">Xi Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Y">Yequan Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lin,+Y">Yonghua Lin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qin,+Y">Yong Qin</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Sound (cs.SD); Audio and Speech Processing (eess.AS) </div> </div> </dd> </dl> <dl 
id='articles'> <h3>Fri, 21 Mar 2025 (showing 4 of 4 entries )</h3> <dt> <a name='item16'>[16]</a> <a href ="/abs/2503.15627" title="Abstract" id="2503.15627"> arXiv:2503.15627 </a> [<a href="/pdf/2503.15627" title="Download PDF" id="pdf-2503.15627" aria-labelledby="pdf-2503.15627">pdf</a>, <a href="https://arxiv.org/html/2503.15627v1" title="View HTML" id="html-2503.15627" aria-labelledby="html-2503.15627" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.15627" title="Other formats" id="oth-2503.15627" aria-labelledby="oth-2503.15627">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> A Speech Production Model for Radar: Connecting Speech Acoustics with Radar-Measured Vibrations </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&query=Lenz,+I">Isabella Lenz</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Rong,+Y">Yu Rong</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Bliss,+D">Daniel Bliss</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Liss,+J">Julie Liss</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Berisha,+V">Visar Berisha</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 5 pages, 6 figure, InterSpeech Conference </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Audio and Speech Processing (eess.AS)</span>; Signal Processing (eess.SP) </div> </div> </dd> <dt> <a name='item17'>[17]</a> <a href ="/abs/2503.16357" title="Abstract" id="2503.16357"> arXiv:2503.16357 </a> (cross-list from cs.CV) [<a href="/pdf/2503.16357" title="Download PDF" id="pdf-2503.16357" aria-labelledby="pdf-2503.16357">pdf</a>, <a href="https://arxiv.org/html/2503.16357v1" title="View HTML" id="html-2503.16357" aria-labelledby="html-2503.16357" rel="noopener noreferrer" target="_blank">html</a>, 
<a href="/format/2503.16357" title="Other formats" id="oth-2503.16357" aria-labelledby="oth-2503.16357">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> UniSync: A Unified Framework for Audio-Visual Synchronization </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Feng,+T">Tao Feng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xie,+Y">Yifan Xie</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Guan,+X">Xun Guan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Song,+J">Jiyuan Song</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Z">Zhou Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ma,+F">Fei Ma</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+F">Fei Yu</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 7 pages, 3 figures, accepted by ICME 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Sound (cs.SD); Audio and Speech Processing (eess.AS) </div> </div> </dd> <dt> <a name='item18'>[18]</a> <a href ="/abs/2503.15501" title="Abstract" id="2503.15501"> arXiv:2503.15501 </a> (cross-list from cs.HC) [<a href="/pdf/2503.15501" title="Download PDF" id="pdf-2503.15501" aria-labelledby="pdf-2503.15501">pdf</a>, <a href="https://arxiv.org/html/2503.15501v1" title="View HTML" id="html-2503.15501" aria-labelledby="html-2503.15501" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.15501" title="Other formats" id="oth-2503.15501" aria-labelledby="oth-2503.15501">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Development of an Inclusive Educational Platform Using Open Technologies and Machine Learning: A Case Study on 
Accessibility Enhancement </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Togni,+J">Jimi Togni</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 14 pages, 1 figure </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Human-Computer Interaction (cs.HC)</span>; Artificial Intelligence (cs.AI); Machine Learning (cs.LG); Audio and Speech Processing (eess.AS) </div> </div> </dd> <dt> <a name='item19'>[19]</a> <a href ="/abs/2503.15498" title="Abstract" id="2503.15498"> arXiv:2503.15498 </a> (cross-list from cs.HC) [<a href="/pdf/2503.15498" title="Download PDF" id="pdf-2503.15498" aria-labelledby="pdf-2503.15498">pdf</a>, <a href="https://arxiv.org/html/2503.15498v1" title="View HTML" id="html-2503.15498" aria-labelledby="html-2503.15498" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.15498" title="Other formats" id="oth-2503.15498" aria-labelledby="oth-2503.15498">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Revival: Collaborative Artistic Creation through Human-AI Interactions in Musical Creativity </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Lee,+K+J+M">Keon Ju M. Lee</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Pasquier,+P">Philippe Pasquier</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yuri,+J">Jun Yuri</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Keon Ju M. Lee, Philippe Pasquier and Jun Yuri. 2024. 
In Proceedings of the Creativity and Generative AI NIPS (Neural Information Processing Systems) Workshop </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Human-Computer Interaction (cs.HC)</span>; Artificial Intelligence (cs.AI); Multiagent Systems (cs.MA); Multimedia (cs.MM); Sound (cs.SD); Audio and Speech Processing (eess.AS) </div> </div> </dd> </dl> <dl id='articles'> <h3>Thu, 20 Mar 2025 (showing 6 of 6 entries )</h3> <dt> <a name='item20'>[20]</a> <a href ="/abs/2503.15338" title="Abstract" id="2503.15338"> arXiv:2503.15338 </a> [<a href="/pdf/2503.15338" title="Download PDF" id="pdf-2503.15338" aria-labelledby="pdf-2503.15338">pdf</a>, <a href="https://arxiv.org/html/2503.15338v1" title="View HTML" id="html-2503.15338" aria-labelledby="html-2503.15338" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.15338" title="Other formats" id="oth-2503.15338" aria-labelledby="oth-2503.15338">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Solla: Towards a Speech-Oriented LLM That Hears Acoustic Context </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&query=Ao,+J">Junyi Ao</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Chen,+D">Dekun Chen</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Tian,+X">Xiaohai Tian</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Feng,+W">Wenjie Feng</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Zhang,+J">Jun Zhang</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Lu,+L">Lu Lu</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Wang,+Y">Yuxuan Wang</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Li,+H">Haizhou Li</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Wu,+Z">Zhizheng Wu</a></div> <div 
class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Audio and Speech Processing (eess.AS)</span>; Computation and Language (cs.CL); Sound (cs.SD) </div> </div> </dd> <dt> <a name='item21'>[21]</a> <a href ="/abs/2503.14854" title="Abstract" id="2503.14854"> arXiv:2503.14854 </a> [<a href="/pdf/2503.14854" title="Download PDF" id="pdf-2503.14854" aria-labelledby="pdf-2503.14854">pdf</a>, <a href="https://arxiv.org/html/2503.14854v1" title="View HTML" id="html-2503.14854" aria-labelledby="html-2503.14854" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.14854" title="Other formats" id="oth-2503.14854" aria-labelledby="oth-2503.14854">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Analysis and Extension of Noisy-target Training for Unsupervised Target Signal Enhancement </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&query=Fujimura,+T">Takuya Fujimura</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Toda,+T">Tomoki Toda</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Audio and Speech Processing (eess.AS)</span> </div> </div> </dd> <dt> <a name='item22'>[22]</a> <a href ="/abs/2503.15164" title="Abstract" id="2503.15164"> arXiv:2503.15164 </a> (cross-list from eess.SP) [<a href="/pdf/2503.15164" title="Download PDF" id="pdf-2503.15164" aria-labelledby="pdf-2503.15164">pdf</a>, <a href="https://arxiv.org/html/2503.15164v1" title="View HTML" id="html-2503.15164" aria-labelledby="html-2503.15164" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.15164" title="Other formats" id="oth-2503.15164" aria-labelledby="oth-2503.15164">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Gridless Chirp Parameter Retrieval via Constrained 
Two-Dimensional Atomic Norm Minimization </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&query=Yang,+D">Dehui Yang</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Xi,+F">Feng Xi</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Signal Processing (eess.SP)</span>; Audio and Speech Processing (eess.AS) </div> </div> </dd> <dt> <a name='item23'>[23]</a> <a href ="/abs/2503.15074" title="Abstract" id="2503.15074"> arXiv:2503.15074 </a> (cross-list from cs.SD) [<a href="/pdf/2503.15074" title="Download PDF" id="pdf-2503.15074" aria-labelledby="pdf-2503.15074">pdf</a>, <a href="https://arxiv.org/html/2503.15074v1" title="View HTML" id="html-2503.15074" aria-labelledby="html-2503.15074" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.15074" title="Other formats" id="oth-2503.15074" aria-labelledby="oth-2503.15074">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> InsectSet459: an open dataset of insect sounds for bioacoustic machine learning </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Fai%C3%9F,+M">Marius Faiß</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ghani,+B">Burooj Ghani</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Stowell,+D">Dan Stowell</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Sound (cs.SD)</span>; Audio and Speech Processing (eess.AS) </div> </div> </dd> <dt> <a name='item24'>[24]</a> <a href ="/abs/2503.14928" title="Abstract" id="2503.14928"> arXiv:2503.14928 </a> (cross-list from cs.CV) [<a href="/pdf/2503.14928" title="Download PDF" id="pdf-2503.14928" aria-labelledby="pdf-2503.14928">pdf</a>, <a href="https://arxiv.org/html/2503.14928v1" title="View HTML" id="html-2503.14928" 
aria-labelledby="html-2503.14928" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.14928" title="Other formats" id="oth-2503.14928" aria-labelledby="oth-2503.14928">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Shushing! Let's Imagine an Authentic Speech from the Silent Video </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Ye,+J">Jiaxin Ye</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shan,+H">Hongming Shan</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Project Page: <a href="https://imagintalk.github.io" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Artificial Intelligence (cs.AI); Sound (cs.SD); Audio and Speech Processing (eess.AS) </div> </div> </dd> <dt> <a name='item25'>[25]</a> <a href ="/abs/2503.14545" title="Abstract" id="2503.14545"> arXiv:2503.14545 </a> (cross-list from cs.LG) [<a href="/pdf/2503.14545" title="Download PDF" id="pdf-2503.14545" aria-labelledby="pdf-2503.14545">pdf</a>, <a href="https://arxiv.org/html/2503.14545v1" title="View HTML" id="html-2503.14545" aria-labelledby="html-2503.14545" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.14545" title="Other formats" id="oth-2503.14545" aria-labelledby="oth-2503.14545">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> PANDORA: Diffusion Policy Learning for Dexterous Robotic Piano Playing </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+Y">Yanjia Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+R">Renjie Li</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Tu,+Z">Zhengzhong Tu</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Robotics (cs.RO); Sound (cs.SD); Audio and Speech Processing (eess.AS) </div> </div> </dd> </dl> <dl id='articles'> <h3>Wed, 19 Mar 2025 (showing 4 of 4 entries )</h3> <dt> <a name='item26'>[26]</a> <a href ="/abs/2503.14345" title="Abstract" id="2503.14345"> arXiv:2503.14345 </a> [<a href="/pdf/2503.14345" title="Download PDF" id="pdf-2503.14345" aria-labelledby="pdf-2503.14345">pdf</a>, <a href="/format/2503.14345" title="Other formats" id="oth-2503.14345" aria-labelledby="oth-2503.14345">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> MoonCast: High-Quality Zero-Shot Podcast Generation </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&query=Ju,+Z">Zeqian Ju</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Yang,+D">Dongchao Yang</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Yu,+J">Jianwei Yu</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Shen,+K">Kai Shen</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Leng,+Y">Yichong Leng</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Wang,+Z">Zhengtao Wang</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Tan,+X">Xu Tan</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Zhou,+X">Xinyu Zhou</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Qin,+T">Tao Qin</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Li,+X">Xiangyang Li</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Audio and Speech Processing (eess.AS)</span>; Artificial Intelligence (cs.AI); Computation and Language 
(cs.CL); Machine Learning (cs.LG); Sound (cs.SD) </div> </div> </dd> <dt> <a name='item27'>[27]</a> <a href ="/abs/2503.14207" title="Abstract" id="2503.14207"> arXiv:2503.14207 </a> [<a href="/pdf/2503.14207" title="Download PDF" id="pdf-2503.14207" aria-labelledby="pdf-2503.14207">pdf</a>, <a href="https://arxiv.org/html/2503.14207v1" title="View HTML" id="html-2503.14207" aria-labelledby="html-2503.14207" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.14207" title="Other formats" id="oth-2503.14207" aria-labelledby="oth-2503.14207">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Room Impulse Response Estimation through Optimal Mass Transport Barycenters </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&query=Pallewela,+R">Rumeshika Pallewela</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Liu,+Y">Yuyang Liu</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Elvander,+F">Filip Elvander</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Submitted to EUSCIPCO 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Audio and Speech Processing (eess.AS)</span>; Signal Processing (eess.SP) </div> </div> </dd> <dt> <a name='item28'>[28]</a> <a href ="/abs/2503.14036" title="Abstract" id="2503.14036"> arXiv:2503.14036 </a> [<a href="/pdf/2503.14036" title="Download PDF" id="pdf-2503.14036" aria-labelledby="pdf-2503.14036">pdf</a>, <a href="https://arxiv.org/html/2503.14036v1" title="View HTML" id="html-2503.14036" aria-labelledby="html-2503.14036" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.14036" title="Other formats" id="oth-2503.14036" aria-labelledby="oth-2503.14036">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> 
Variational Autoencoder for Personalized Pathological Speech Enhancement </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&query=Hou,+M">Mingchi Hou</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Kodrasi,+I">Ina Kodrasi</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Submitted to EUSIPCO 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Audio and Speech Processing (eess.AS)</span>; Sound (cs.SD) </div> </div> </dd> <dt> <a name='item29'>[29]</a> <a href ="/abs/2503.14185" title="Abstract" id="2503.14185"> arXiv:2503.14185 </a> (cross-list from cs.CL) [<a href="/pdf/2503.14185" title="Download PDF" id="pdf-2503.14185" aria-labelledby="pdf-2503.14185">pdf</a>, <a href="https://arxiv.org/html/2503.14185v1" title="View HTML" id="html-2503.14185" aria-labelledby="html-2503.14185" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.14185" title="Other formats" id="oth-2503.14185" aria-labelledby="oth-2503.14185">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> AdaST: Dynamically Adapting Encoder States in the Decoder for End-to-End Speech-to-Text Translation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+W">Wuwei Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+D">Dexin Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xiong,+D">Deyi Xiong</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> ACL 2021 Findings </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Sound (cs.SD); Audio and Speech Processing (eess.AS) </div> </div> </dd> </dl> <div class='paging'>Total of 29 entries </div> <div 
class='morefewer'>Showing up to 50 entries per page: <a href="/list/eess.AS/recent?skip=0&show=25" rel="nofollow"> fewer</a> | <span style="color: #454545">more</span> | <span style="color: #454545">all</span> </div> </div> </div> </div> </main> <footer style="clear: both;"> <div class="columns is-desktop" role="navigation" aria-label="Secondary" style="margin: -0.75em -0.75em 0.75em -0.75em">  <div class="column" style="padding: 0;"> <div class="columns"> <div class="column"> <ul style="list-style: none; line-height: 2;"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul style="list-style: none; line-height: 2;"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div>   <div class="column" 
style="padding: 0;"> <div class="columns"> <div class="column"> <ul style="list-style: none; line-height: 2;"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul style="list-style: none; line-height: 2;"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 
47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div>   </div> </footer> </div> <script src="/static/base/1.0.1/js/member_acknowledgement.js"></script> </body> </html>

CINXE.COM

Audio and Speech Processing