CINXE.COM
Audio and Speech Processing
<!DOCTYPE html> <html lang="en"> <head> <title>Audio and Speech Processing </title> <meta name="viewport" content="width=device-width, initial-scale=1"> <link rel="apple-touch-icon" sizes="180x180" href="/static/browse/0.3.4/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="/static/browse/0.3.4/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="/static/browse/0.3.4/images/icons/favicon-16x16.png"> <link rel="manifest" href="/static/browse/0.3.4/images/icons/site.webmanifest"> <link rel="mask-icon" href="/static/browse/0.3.4/images/icons/safari-pinned-tab.svg" color="#5bbad5"> <meta name="msapplication-TileColor" content="#da532c"> <meta name="theme-color" content="#ffffff"> <link rel="stylesheet" type="text/css" media="screen" href="/static/browse/0.3.4/css/arXiv.css?v=20241206" /> <link rel="stylesheet" type="text/css" media="print" href="/static/browse/0.3.4/css/arXiv-print.css?v=20200611" /> <link rel="stylesheet" type="text/css" media="screen" href="/static/browse/0.3.4/css/browse_search.css" /> <script language="javascript" src="/static/browse/0.3.4/js/accordion.js"></script> <script src="/static/browse/0.3.4/js/mathjaxToggle.min.js" type="text/javascript"></script> <script type="text/javascript" language="javascript">mathjaxToggle();</script> </head> <body class="with-cu-identity"> <div class="flex-wrap-footer"> <header> <a href="#content" class="is-sr-only">Skip to main content</a> <!-- start desktop header --> <div class="columns is-vcentered is-hidden-mobile" id="cu-identity"> <div class="column" id="cu-logo"> <a href="https://www.cornell.edu/"><img src="/static/browse/0.3.4/images/icons/cu/cornell-reduced-white-SMALL.svg" alt="Cornell University" /></a> </div><div class="column" id="support-ack"> <span id="support-ack-url">We gratefully acknowledge support from the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all 
contributors.</span> <a href="https://info.arxiv.org/about/donate.html" class="btn-header-donate">Donate</a> </div> </div> <div id="header" class="is-hidden-mobile"> <a aria-hidden="true" tabindex="-1" href="/IgnoreMe"></a> <div class="header-breadcrumbs"> <a href="/"><img src="/static/browse/0.3.4/images/arxiv-logo-one-color-white.svg" alt="arxiv logo" style="height:40px;"/></a> <span>></span> <a href="/list/eess.AS/recent">eess.AS</a> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div><!-- /end desktop header --> <div class="mobile-header"> <div class="columns is-mobile"> <div class="column logo-arxiv"><a 
href="https://arxiv.org/"><img src="/static/browse/0.3.4/images/arxiv-logomark-small-white.svg" alt="arXiv logo" style="height:60px;" /></a></div> <div class="column logo-cornell"><a href="https://www.cornell.edu/"> <picture> <source media="(min-width: 501px)" srcset="/static/browse/0.3.4/images/icons/cu/cornell-reduced-white-SMALL.svg 400w" sizes="400w" /> <source srcset="/static/browse/0.3.4/images/icons/cu/cornell_seal_simple_black.svg 2x" /> <img src="/static/browse/0.3.4/images/icons/cu/cornell-reduced-white-SMALL.svg" alt="Cornell University Logo" /> </picture> </a></div> <div class="column nav" id="toggle-container" role="menubar"> <button class="toggle-control"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-white"><title>open search</title><path d="M505 442.7L405.3 343c-4.5-4.5-10.6-7-17-7H372c27.6-35.3 44-79.7 44-128C416 93.1 322.9 0 208 0S0 93.1 0 208s93.1 208 208 208c48.3 0 92.7-16.4 128-44v16.3c0 6.4 2.5 12.5 7 17l99.7 99.7c9.4 9.4 24.6 9.4 33.9 0l28.3-28.3c9.4-9.4 9.4-24.6.1-34zM208 336c-70.7 0-128-57.2-128-128 0-70.7 57.2-128 128-128 70.7 0 128 57.2 128 128 0 70.7-57.2 128-128 128z"/></svg></button> <div class="mobile-toggle-block toggle-target"> <form class="mobile-search-form" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <input class="input" type="text" name="query" placeholder="Search..." 
aria-label="Search term or terms" /> <input type="hidden" name="source" value="header"> <input type="hidden" name="searchtype" value="all"> <button class="button">GO</button> </div> </form> </div> <button class="toggle-control"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-white" role="menu"><title>open navigation menu</title><path d="M16 132h416c8.837 0 16-7.163 16-16V76c0-8.837-7.163-16-16-16H16C7.163 60 0 67.163 0 76v40c0 8.837 7.163 16 16 16zm0 160h416c8.837 0 16-7.163 16-16v-40c0-8.837-7.163-16-16-16H16c-8.837 0-16 7.163-16 16v40c0 8.837 7.163 16 16 16zm0 160h416c8.837 0 16-7.163 16-16v-40c0-8.837-7.163-16-16-16H16c-8.837 0-16 7.163-16 16v40c0 8.837 7.163 16 16 16z"/></svg></button> <div class="mobile-toggle-block toggle-target"> <nav class="mobile-menu" aria-labelledby="mobilemenulabel"> <h2 id="mobilemenulabel">quick links</h2> <ul> <li><a href="https://arxiv.org/login">Login</a></li> <li><a href="https://info.arxiv.org/help">Help Pages</a></li> <li><a href="https://info.arxiv.org/about">About</a></li> </ul> </nav> </div> </div> </div> </div><!-- /end mobile-header --> </header> <main> <div id="content"> <div id='content-inner'> <div id='dlpage'> <h1>Audio and Speech Processing</h1> <h2>Authors and titles for recent submissions</h2> <ul> <li> <a href="/list/eess.AS/recent?skip=0&show=50"> Fri, 28 Mar 2025 </a> </li><li> <a href="/list/eess.AS/recent?skip=5&show=50"> Thu, 27 Mar 2025 </a> </li><li> <a href="/list/eess.AS/recent?skip=11&show=50"> Wed, 26 Mar 2025 </a> </li><li> <a href="/list/eess.AS/recent?skip=14&show=50"> Tue, 25 Mar 2025 </a> </li><li> <a href="/list/eess.AS/recent?skip=25&show=50"> Mon, 24 Mar 2025 </a> </li></ul> <p>See today's <a id="new-eess.AS" aria-labelledby="new-eess.AS" href="/list/eess.AS/new">new</a> changes</p> <div class='paging'>Total of 29 entries </div> <div class='morefewer'>Showing up to 50 entries per page: <a href="/list/eess.AS/recent?skip=15&show=25" rel="nofollow"> fewer</a> | <span 
style="color: #454545">more</span> | <span style="color: #454545">all</span> </div> <dl id='articles'> <h3>Tue, 25 Mar 2025 (continued, showing last 10 of 11 entries )</h3> <dt> <a name='item16'>[16]</a> <a href ="/abs/2503.18590" title="Abstract" id="2503.18590"> arXiv:2503.18590 </a> [<a href="/pdf/2503.18590" title="Download PDF" id="pdf-2503.18590" aria-labelledby="pdf-2503.18590">pdf</a>, <a href="https://arxiv.org/html/2503.18590v1" title="View HTML" id="html-2503.18590" aria-labelledby="html-2503.18590" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.18590" title="Other formats" id="oth-2503.18590" aria-labelledby="oth-2503.18590">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Target Speaker Selection for Neural Network Beamforming in Multi-Speaker Scenarios </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&query=Fiorio,+L+V">Luan Vinícius Fiorio</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Defraene,+B">Bruno Defraene</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=David,+J">Johan David</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Young,+A">Alex Young</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Widdershoven,+F">Frans Widdershoven</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=van+Houtum,+W">Wim van Houtum</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Aarts,+R+M">Ronald M. 
Aarts</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Audio and Speech Processing (eess.AS)</span>; Signal Processing (eess.SP) </div> </div> </dd> <dt> <a name='item17'>[17]</a> <a href ="/abs/2503.18579" title="Abstract" id="2503.18579"> arXiv:2503.18579 </a> [<a href="/pdf/2503.18579" title="Download PDF" id="pdf-2503.18579" aria-labelledby="pdf-2503.18579">pdf</a>, <a href="https://arxiv.org/html/2503.18579v1" title="View HTML" id="html-2503.18579" aria-labelledby="html-2503.18579" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.18579" title="Other formats" id="oth-2503.18579" aria-labelledby="oth-2503.18579">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Unsupervised Variational Acoustic Clustering </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&query=Fiorio,+L+V">Luan Vinícius Fiorio</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Defraene,+B">Bruno Defraene</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=David,+J">Johan David</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Widdershoven,+F">Frans Widdershoven</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=van+Houtum,+W">Wim van Houtum</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Aarts,+R+M">Ronald M. 
Aarts</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Audio and Speech Processing (eess.AS)</span>; Signal Processing (eess.SP) </div> </div> </dd> <dt> <a name='item18'>[18]</a> <a href ="/abs/2503.18022" title="Abstract" id="2503.18022"> arXiv:2503.18022 </a> [<a href="/pdf/2503.18022" title="Download PDF" id="pdf-2503.18022" aria-labelledby="pdf-2503.18022">pdf</a>, <a href="https://arxiv.org/html/2503.18022v1" title="View HTML" id="html-2503.18022" aria-labelledby="html-2503.18022" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.18022" title="Other formats" id="oth-2503.18022" aria-labelledby="oth-2503.18022">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> A State-of-the-Art Review on Acoustic Preservation of Historical Worship Spaces through Auralization </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&query=Rosseel,+H">Hannes Rosseel</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=van+Waterschoot,+T">Toon van Waterschoot</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 32 pages, 7 figures, 4 tables, Published in Signal Processing </div> <div class='list-journal-ref'><span class='descriptor'>Journal-ref:</span> Signal Processing, vol. 234, p. 
1-25, 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Audio and Speech Processing (eess.AS)</span>; Sound (cs.SD); Signal Processing (eess.SP) </div> </div> </dd> <dt> <a name='item19'>[19]</a> <a href ="/abs/2503.18928" title="Abstract" id="2503.18928"> arXiv:2503.18928 </a> (cross-list from cs.SD) [<a href="/pdf/2503.18928" title="Download PDF" id="pdf-2503.18928" aria-labelledby="pdf-2503.18928">pdf</a>, <a href="/format/2503.18928" title="Other formats" id="oth-2503.18928" aria-labelledby="oth-2503.18928">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> A Reliable and Efficient Detection Pipeline for Rodent Ultrasonic Vocalizations </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Anis,+S+S">Sabah Shahnoor Anis</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kellis,+D+M">Devin M. Kellis</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kaigler,+K+F">Kris Ford Kaigler</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wilson,+M+A">Marlene A. 
Wilson</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=O'Reilly,+C">Christian O'Reilly</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted for publication in the proceeding of the 7th International Conference on Advances in Signal Processing and Artificial Intelligence (ASPAI' 2025), 8-10 April 2025, Innsbruck, Austria </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Sound (cs.SD)</span>; Audio and Speech Processing (eess.AS) </div> </div> </dd> <dt> <a name='item20'>[20]</a> <a href ="/abs/2503.18880" title="Abstract" id="2503.18880"> arXiv:2503.18880 </a> (cross-list from cs.CV) [<a href="/pdf/2503.18880" title="Download PDF" id="pdf-2503.18880" aria-labelledby="pdf-2503.18880">pdf</a>, <a href="https://arxiv.org/html/2503.18880v1" title="View HTML" id="html-2503.18880" aria-labelledby="html-2503.18880" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.18880" title="Other formats" id="oth-2503.18880" aria-labelledby="oth-2503.18880">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Seeing Speech and Sound: Distinguishing and Locating Audios in Visual Scenes </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Ryu,+H">Hyeonggon Ryu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kim,+S">Seongyu Kim</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chung,+J+S">Joon Son Chung</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Senocak,+A">Arda Senocak</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> CVPR 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Sound (cs.SD); Audio and Speech Processing (eess.AS) </div> </div> </dd> 
<dt> <a name='item21'>[21]</a> <a href ="/abs/2503.18698" title="Abstract" id="2503.18698"> arXiv:2503.18698 </a> (cross-list from cs.SD) [<a href="/pdf/2503.18698" title="Download PDF" id="pdf-2503.18698" aria-labelledby="pdf-2503.18698">pdf</a>, <a href="https://arxiv.org/html/2503.18698v1" title="View HTML" id="html-2503.18698" aria-labelledby="html-2503.18698" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.18698" title="Other formats" id="oth-2503.18698" aria-labelledby="oth-2503.18698">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Wireless Hearables With Programmable Speech AI Accelerators </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Itani,+M">Malek Itani</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+T">Tuochao Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Raghavan,+A">Arun Raghavan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kohlberg,+G">Gavriel Kohlberg</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gollakota,+S">Shyamnath Gollakota</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Sound (cs.SD)</span>; Audio and Speech Processing (eess.AS) </div> </div> </dd> <dt> <a name='item22'>[22]</a> <a href ="/abs/2503.18486" title="Abstract" id="2503.18486"> arXiv:2503.18486 </a> (cross-list from cs.SD) [<a href="/pdf/2503.18486" title="Download PDF" id="pdf-2503.18486" aria-labelledby="pdf-2503.18486">pdf</a>, <a href="https://arxiv.org/html/2503.18486v1" title="View HTML" id="html-2503.18486" aria-labelledby="html-2503.18486" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.18486" title="Other formats" id="oth-2503.18486" aria-labelledby="oth-2503.18486">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span 
class='descriptor'>Title:</span> Music Similarity Representation Learning Focusing on Individual Instruments with Source Separation and Human Preference </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Imamura,+T">Takehiro Imamura</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hashizume,+Y">Yuka Hashizume</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+W">Wen-Chin Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Toda,+T">Tomoki Toda</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Sound (cs.SD)</span>; Audio and Speech Processing (eess.AS) </div> </div> </dd> <dt> <a name='item23'>[23]</a> <a href ="/abs/2503.17886" title="Abstract" id="2503.17886"> arXiv:2503.17886 </a> (cross-list from cs.SD) [<a href="/pdf/2503.17886" title="Download PDF" id="pdf-2503.17886" aria-labelledby="pdf-2503.17886">pdf</a>, <a href="https://arxiv.org/html/2503.17886v1" title="View HTML" id="html-2503.17886" aria-labelledby="html-2503.17886" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.17886" title="Other formats" id="oth-2503.17886" aria-labelledby="oth-2503.17886">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Elevating Robust Multi-Talker ASR by Decoupling Speaker Separation and Speech Recognition </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+Y">Yufeng Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Taherian,+H">Hassan Taherian</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kalkhorani,+V+A">Vahid Ahmadi Kalkhorani</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+D">DeLiang Wang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Sound (cs.SD)</span>; 
Audio and Speech Processing (eess.AS) </div> </div> </dd> <dt> <a name='item24'>[24]</a> <a href ="/abs/2503.17634" title="Abstract" id="2503.17634"> arXiv:2503.17634 </a> (cross-list from eess.SY) [<a href="/pdf/2503.17634" title="Download PDF" id="pdf-2503.17634" aria-labelledby="pdf-2503.17634">pdf</a>, <a href="https://arxiv.org/html/2503.17634v1" title="View HTML" id="html-2503.17634" aria-labelledby="html-2503.17634" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.17634" title="Other formats" id="oth-2503.17634" aria-labelledby="oth-2503.17634">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Mixed-gradients Distributed Filtered Reference Least Mean Square Algorithm -- A Robust Distributed Multichannel Active Noise Control Algorithm </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&query=Ji,+J">Junwei Ji</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Shi,+D">Dongyuan Shi</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Gan,+W">Woon-Seng Gan</a></div> <div class='list-journal-ref'><span class='descriptor'>Journal-ref:</span> IEEE Transactions on Audio, Speech and Language Processing,2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Systems and Control (eess.SY)</span>; Audio and Speech Processing (eess.AS); Signal Processing (eess.SP) </div> </div> </dd> <dt> <a name='item25'>[25]</a> <a href ="/abs/2503.17551" title="Abstract" id="2503.17551"> arXiv:2503.17551 </a> (cross-list from cs.MM) [<a href="/pdf/2503.17551" title="Download PDF" id="pdf-2503.17551" aria-labelledby="pdf-2503.17551">pdf</a>, <a href="https://arxiv.org/html/2503.17551v1" title="View HTML" id="html-2503.17551" aria-labelledby="html-2503.17551" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.17551" title="Other formats" id="oth-2503.17551" 
aria-labelledby="oth-2503.17551">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Audio-Enhanced Vision-Language Modeling with Latent Space Broadening for High Quality Data Expansion </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+Y">Yu Sun</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+Y">Yin Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+R">Ruixiao Sun</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+C">Chunhui Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+F">Fangming Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jin,+Z">Ze Jin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+L">Linjie Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shen,+X">Xiang Shen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hao,+Z">Zhuolin Hao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xiong,+H">Hongyu Xiong</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Multimedia (cs.MM)</span>; Artificial Intelligence (cs.AI); Computer Vision and Pattern Recognition (cs.CV); Sound (cs.SD); Audio and Speech Processing (eess.AS) </div> </div> </dd> </dl> <dl id='articles'> <h3>Mon, 24 Mar 2025 (showing 4 of 4 entries )</h3> <dt> <a name='item26'>[26]</a> <a href ="/abs/2503.16956" title="Abstract" id="2503.16956"> arXiv:2503.16956 </a> [<a href="/pdf/2503.16956" title="Download PDF" id="pdf-2503.16956" aria-labelledby="pdf-2503.16956">pdf</a>, <a href="https://arxiv.org/html/2503.16956v1" title="View HTML" id="html-2503.16956" aria-labelledby="html-2503.16956" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.16956" title="Other formats" id="oth-2503.16956" 
aria-labelledby="oth-2503.16956">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> From Faces to Voices: Learning Hierarchical Representations for High-quality Video-to-Speech </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&query=Kim,+J">Ji-Hoon Kim</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Choi,+J">Jeongsoo Choi</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Kim,+J">Jaehun Kim</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Jung,+C">Chaeyoung Jung</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Chung,+J+S">Joon Son Chung</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> CVPR 2025, demo page: <a href="https://mm.kaist.ac.kr/projects/faces2voices/" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Audio and Speech Processing (eess.AS)</span>; Artificial Intelligence (cs.AI); Computer Vision and Pattern Recognition (cs.CV); Sound (cs.SD) </div> </div> </dd> <dt> <a name='item27'>[27]</a> <a href ="/abs/2503.17281" title="Abstract" id="2503.17281"> arXiv:2503.17281 </a> (cross-list from cs.SD) [<a href="/pdf/2503.17281" title="Download PDF" id="pdf-2503.17281" aria-labelledby="pdf-2503.17281">pdf</a>, <a href="https://arxiv.org/html/2503.17281v1" title="View HTML" id="html-2503.17281" aria-labelledby="html-2503.17281" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.17281" title="Other formats" id="oth-2503.17281" aria-labelledby="oth-2503.17281">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Learning disentangled representations for instrument-based music similarity </div> <div class='list-authors'><a 
href="https://arxiv.org/search/cs?searchtype=author&query=Hashizume,+Y">Yuka Hashizume</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+L">Li Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Miyashita,+A">Atsushi Miyashita</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Toda,+T">Tomoki Toda</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> arXiv admin note: text overlap with <a href="https://arxiv.org/abs/2404.06682" data-arxiv-id="2404.06682" class="link-https">arXiv:2404.06682</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Sound (cs.SD)</span>; Audio and Speech Processing (eess.AS) </div> </div> </dd> <dt> <a name='item28'>[28]</a> <a href ="/abs/2503.16853" title="Abstract" id="2503.16853"> arXiv:2503.16853 </a> (cross-list from cs.CL) [<a href="/pdf/2503.16853" title="Download PDF" id="pdf-2503.16853" aria-labelledby="pdf-2503.16853">pdf</a>, <a href="https://arxiv.org/html/2503.16853v1" title="View HTML" id="html-2503.16853" aria-labelledby="html-2503.16853" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.16853" title="Other formats" id="oth-2503.16853" aria-labelledby="oth-2503.16853">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Imagine to Hear: Auditory Knowledge Generation can be an Effective Assistant for Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Yoo,+S">Suho Yoo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ok,+H">Hyunjong Ok</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lee,+J">Jaeho Lee</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Preprint </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and 
Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Machine Learning (cs.LG); Sound (cs.SD); Audio and Speech Processing (eess.AS) </div> </div> </dd> <dt> <a name='item29'>[29]</a> <a href ="/abs/2503.16578" title="Abstract" id="2503.16578"> arXiv:2503.16578 </a> (cross-list from cs.CL) [<a href="/pdf/2503.16578" title="Download PDF" id="pdf-2503.16578" aria-labelledby="pdf-2503.16578">pdf</a>, <a href="https://arxiv.org/html/2503.16578v1" title="View HTML" id="html-2503.16578" aria-labelledby="html-2503.16578" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.16578" title="Other formats" id="oth-2503.16578" aria-labelledby="oth-2503.16578">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> SeniorTalk: A Chinese Conversation Dataset with Rich Annotations for Super-Aged Seniors </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+Y">Yang Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+H">Hui Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+S">Shiyao Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+J">Junyang Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=He,+J">Jiabei He</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhou,+J">Jiaming Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+X">Xi Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+Y">Yequan Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lin,+Y">Yonghua Lin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qin,+Y">Yong Qin</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Sound (cs.SD); Audio and Speech Processing (eess.AS) </div> </div> </dd> </dl> <div 
class='paging'>Total of 29 entries </div> <div class='morefewer'>Showing up to 50 entries per page: <a href="/list/eess.AS/recent?skip=15&show=25" rel="nofollow"> fewer</a> | <span style="color: #454545">more</span> | <span style="color: #454545">all</span> </div> </div> </div> </div> </main> <footer style="clear: both;"> <div class="columns is-desktop" role="navigation" aria-label="Secondary" style="margin: -0.75em -0.75em 0.75em -0.75em"> <!-- Macro-Column 1 --> <div class="column" style="padding: 0;"> <div class="columns"> <div class="column"> <ul style="list-style: none; line-height: 2;"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul style="list-style: none; line-height: 2;"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> 
Subscribe</a> </li> </ul> </div> </div> </div> <!-- End Macro-Column 1 --> <!-- Macro-Column 2 --> <div class="column" style="padding: 0;"> <div class="columns"> <div class="column"> <ul style="list-style: none; line-height: 2;"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul style="list-style: none; line-height: 2;"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 
315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> <!-- End Macro-Column 2 --> </div> </footer> </div> <script src="/static/base/1.0.1/js/member_acknowledgement.js"></script> </body> </html>