Sound

<!DOCTYPE html> <html lang="en"> <head> <title>Sound </title> <meta name="viewport" content="width=device-width, initial-scale=1"> <link rel="apple-touch-icon" sizes="180x180" href="/static/browse/0.3.4/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="/static/browse/0.3.4/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="/static/browse/0.3.4/images/icons/favicon-16x16.png"> <link rel="manifest" href="/static/browse/0.3.4/images/icons/site.webmanifest"> <link rel="mask-icon" href="/static/browse/0.3.4/images/icons/safari-pinned-tab.svg" color="#5bbad5"> <meta name="msapplication-TileColor" content="#da532c"> <meta name="theme-color" content="#ffffff"> <link rel="stylesheet" type="text/css" media="screen" href="/static/browse/0.3.4/css/arXiv.css?v=20241206" /> <link rel="stylesheet" type="text/css" media="print" href="/static/browse/0.3.4/css/arXiv-print.css?v=20200611" /> <link rel="stylesheet" type="text/css" media="screen" href="/static/browse/0.3.4/css/browse_search.css" /> <script language="javascript" src="/static/browse/0.3.4/js/accordion.js" /></script> <script src="/static/browse/0.3.4/js/mathjaxToggle.min.js" type="text/javascript"></script> <script type="text/javascript" language="javascript">mathjaxToggle();</script> </head> <body class="with-cu-identity"> <div class="flex-wrap-footer"> <header> <a href="#content" class="is-sr-only">Skip to main content</a>  <div class="columns is-vcentered is-hidden-mobile" id="cu-identity"> <div class="column" id="cu-logo"> <a href="https://www.cornell.edu/"><img src="/static/browse/0.3.4/images/icons/cu/cornell-reduced-white-SMALL.svg" alt="Cornell University" /></a> </div><div class="column" id="support-ack"> <span id="support-ack-url">We gratefully acknowledge support from the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors.</span> <a 
href="https://info.arxiv.org/about/donate.html" class="btn-header-donate">Donate</a> </div> </div> <div id="header" class="is-hidden-mobile"> <a aria-hidden="true" tabindex="-1" href="/IgnoreMe"></a> <div class="header-breadcrumbs"> <a href="/"><img src="/static/browse/0.3.4/images/arxiv-logo-one-color-white.svg" alt="arxiv logo" style="height:40px;"/></a> <span>></span> <a href="/list/cs.SD/recent">cs.SD</a> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <div class="mobile-header"> <div class="columns is-mobile"> <div class="column logo-arxiv"><a href="https://arxiv.org/"><img 
src="/static/browse/0.3.4/images/arxiv-logomark-small-white.svg" alt="arXiv logo" style="height:60px;" /></a></div> <div class="column logo-cornell"><a href="https://www.cornell.edu/"> <picture> <source media="(min-width: 501px)" srcset="/static/browse/0.3.4/images/icons/cu/cornell-reduced-white-SMALL.svg 400w" sizes="400w" /> <source srcset="/static/browse/0.3.4/images/icons/cu/cornell_seal_simple_black.svg 2x" /> <img src="/static/browse/0.3.4/images/icons/cu/cornell-reduced-white-SMALL.svg" alt="Cornell University Logo" /> </picture> </a></div> <div class="column nav" id="toggle-container" role="menubar"> <button class="toggle-control"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-white"><title>open search</title><path d="M505 442.7L405.3 343c-4.5-4.5-10.6-7-17-7H372c27.6-35.3 44-79.7 44-128C416 93.1 322.9 0 208 0S0 93.1 0 208s93.1 208 208 208c48.3 0 92.7-16.4 128-44v16.3c0 6.4 2.5 12.5 7 17l99.7 99.7c9.4 9.4 24.6 9.4 33.9 0l28.3-28.3c9.4-9.4 9.4-24.6.1-34zM208 336c-70.7 0-128-57.2-128-128 0-70.7 57.2-128 128-128 70.7 0 128 57.2 128 128 0 70.7-57.2 128-128 128z"/></svg></button> <div class="mobile-toggle-block toggle-target"> <form class="mobile-search-form" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <input class="input" type="text" name="query" placeholder="Search..." 
aria-label="Search term or terms" /> <input type="hidden" name="source" value="header"> <input type="hidden" name="searchtype" value="all"> <button class="button">GO</button> </div> </form> </div> <button class="toggle-control"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-white" role="menu"><title>open navigation menu</title><path d="M16 132h416c8.837 0 16-7.163 16-16V76c0-8.837-7.163-16-16-16H16C7.163 60 0 67.163 0 76v40c0 8.837 7.163 16 16 16zm0 160h416c8.837 0 16-7.163 16-16v-40c0-8.837-7.163-16-16-16H16c-8.837 0-16 7.163-16 16v40c0 8.837 7.163 16 16 16zm0 160h416c8.837 0 16-7.163 16-16v-40c0-8.837-7.163-16-16-16H16c-8.837 0-16 7.163-16 16v40c0 8.837 7.163 16 16 16z"/ ></svg></button> <div class="mobile-toggle-block toggle-target"> <nav class="mobile-menu" aria-labelledby="mobilemenulabel"> <h2 id="mobilemenulabel">quick links</h2> <ul> <li><a href="https://arxiv.org/login">Login</a></li> <li><a href="https://info.arxiv.org/help">Help Pages</a></li> <li><a href="https://info.arxiv.org/about">About</a></li> </ul> </nav> </div> </div> </div> </div> </header> <main> <div id="content"> <div id='content-inner'> <div id='dlpage'> <h1>Sound</h1> <h2>Authors and titles for recent submissions</h2> <ul> <li> <a href="/list/cs.SD/recent?skip=0&show=50"> Fri, 21 Mar 2025 </a> </li><li> <a href="/list/cs.SD/recent?skip=4&show=50"> Thu, 20 Mar 2025 </a> </li><li> <a href="/list/cs.SD/recent?skip=10&show=50"> Wed, 19 Mar 2025 </a> </li><li> <a href="/list/cs.SD/recent?skip=15&show=50"> Tue, 18 Mar 2025 </a> </li><li> <a href="/list/cs.SD/recent?skip=28&show=50"> Mon, 17 Mar 2025 </a> </li></ul> <p>See today's <a id="new-cs.SD" aria-labelledby="new-cs.SD" href="/list/cs.SD/new">new</a> changes</p> <div class='paging'>Total of 39 entries </div> <div class='morefewer'>Showing up to 50 entries per page: <a href=/list/cs.SD/recent?skip=0&show=25 rel="nofollow"> fewer</a> | <span style="color: #454545">more</span> | <span style="color: 
#454545">all</span> </div> <dl id='articles'> <h3>Fri, 21 Mar 2025 (showing 4 of 4 entries )</h3> <dt> <a name='item1'>[1]</a> <a href ="/abs/2503.15576" title="Abstract" id="2503.15576"> arXiv:2503.15576 </a> [<a href="/pdf/2503.15576" title="Download PDF" id="pdf-2503.15576" aria-labelledby="pdf-2503.15576">pdf</a>, <a href="https://arxiv.org/html/2503.15576v1" title="View HTML" id="html-2503.15576" aria-labelledby="html-2503.15576" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.15576" title="Other formats" id="oth-2503.15576" aria-labelledby="oth-2503.15576">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> A Bird Song Detector for improving bird identification through Deep Learning: a case study from Doñana </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=M%C3%A1rquez-Rodr%C3%ADguez,+A">Alba Márquez-Rodríguez</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mohedano-Munoz,+M+%C3%81">Miguel Ángel Mohedano-Munoz</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mar%C3%ADn-Jim%C3%A9nez,+M+J">Manuel J. 
Marín-Jiménez</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Santamar%C3%ADa-Garc%C3%ADa,+E">Eduardo Santamaría-García</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bastianelli,+G">Giulia Bastianelli</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jordano,+P">Pedro Jordano</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mendoza,+I">Irene Mendoza</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 20 pages, 13 images, for associated dataset see <a href="https://huggingface.co/datasets/GrunCrow/BIRDeep_AudioAnnotations" rel="external noopener nofollow" class="link-external link-https">this https URL</a> , for associated code see <a href="https://github.com/GrunCrow/BIRDeep_BirdSongDetector_NeuralNetworks" rel="external noopener nofollow" class="link-external link-https">this https URL</a> and <a href="https://github.com/GrunCrow/Bird-Song-Detector" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Sound (cs.SD)</span>; Artificial Intelligence (cs.AI); Computer Vision and Pattern Recognition (cs.CV); Machine Learning (cs.LG); Neural and Evolutionary Computing (cs.NE) </div> </div> </dd> <dt> <a name='item2'>[2]</a> <a href ="/abs/2503.16357" title="Abstract" id="2503.16357"> arXiv:2503.16357 </a> (cross-list from cs.CV) [<a href="/pdf/2503.16357" title="Download PDF" id="pdf-2503.16357" aria-labelledby="pdf-2503.16357">pdf</a>, <a href="https://arxiv.org/html/2503.16357v1" title="View HTML" id="html-2503.16357" aria-labelledby="html-2503.16357" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.16357" title="Other formats" id="oth-2503.16357" aria-labelledby="oth-2503.16357">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> UniSync: A 
Unified Framework for Audio-Visual Synchronization </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Feng,+T">Tao Feng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xie,+Y">Yifan Xie</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Guan,+X">Xun Guan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Song,+J">Jiyuan Song</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+Z">Zhou Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ma,+F">Fei Ma</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+F">Fei Yu</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 7 pages, 3 figures, accepted by ICME 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Sound (cs.SD); Audio and Speech Processing (eess.AS) </div> </div> </dd> <dt> <a name='item3'>[3]</a> <a href ="/abs/2503.16311" title="Abstract" id="2503.16311"> arXiv:2503.16311 </a> (cross-list from cs.LG) [<a href="/pdf/2503.16311" title="Download PDF" id="pdf-2503.16311" aria-labelledby="pdf-2503.16311">pdf</a>, <a href="https://arxiv.org/html/2503.16311v1" title="View HTML" id="html-2503.16311" aria-labelledby="html-2503.16311" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.16311" title="Other formats" id="oth-2503.16311" aria-labelledby="oth-2503.16311">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Structured-Noise Masked Modeling for Video, Audio and Beyond </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Bhowmik,+A">Aritra Bhowmik</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Thoker,+F+M">Fida Mohammad Thoker</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Hinojosa,+C">Carlos Hinojosa</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ghanem,+B">Bernard Ghanem</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Snoek,+C+G+M">Cees G. M. Snoek</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Artificial Intelligence (cs.AI); Sound (cs.SD) </div> </div> </dd> <dt> <a name='item4'>[4]</a> <a href ="/abs/2503.15498" title="Abstract" id="2503.15498"> arXiv:2503.15498 </a> (cross-list from cs.HC) [<a href="/pdf/2503.15498" title="Download PDF" id="pdf-2503.15498" aria-labelledby="pdf-2503.15498">pdf</a>, <a href="https://arxiv.org/html/2503.15498v1" title="View HTML" id="html-2503.15498" aria-labelledby="html-2503.15498" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.15498" title="Other formats" id="oth-2503.15498" aria-labelledby="oth-2503.15498">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Revival: Collaborative Artistic Creation through Human-AI Interactions in Musical Creativity </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Lee,+K+J+M">Keon Ju M. Lee</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Pasquier,+P">Philippe Pasquier</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yuri,+J">Jun Yuri</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Keon Ju M. Lee, Philippe Pasquier and Jun Yuri. 2024. 
In Proceedings of the Creativity and Generative AI NIPS (Neural Information Processing Systems) Workshop </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Human-Computer Interaction (cs.HC)</span>; Artificial Intelligence (cs.AI); Multiagent Systems (cs.MA); Multimedia (cs.MM); Sound (cs.SD); Audio and Speech Processing (eess.AS) </div> </div> </dd> </dl> <dl id='articles'> <h3>Thu, 20 Mar 2025 (showing 6 of 6 entries )</h3> <dt> <a name='item5'>[5]</a> <a href ="/abs/2503.15074" title="Abstract" id="2503.15074"> arXiv:2503.15074 </a> [<a href="/pdf/2503.15074" title="Download PDF" id="pdf-2503.15074" aria-labelledby="pdf-2503.15074">pdf</a>, <a href="https://arxiv.org/html/2503.15074v1" title="View HTML" id="html-2503.15074" aria-labelledby="html-2503.15074" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.15074" title="Other formats" id="oth-2503.15074" aria-labelledby="oth-2503.15074">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> InsectSet459: an open dataset of insect sounds for bioacoustic machine learning </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Fai%C3%9F,+M">Marius Faiß</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ghani,+B">Burooj Ghani</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Stowell,+D">Dan Stowell</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Sound (cs.SD)</span>; Audio and Speech Processing (eess.AS) </div> </div> </dd> <dt> <a name='item6'>[6]</a> <a href ="/abs/2503.15338" title="Abstract" id="2503.15338"> arXiv:2503.15338 </a> (cross-list from eess.AS) [<a href="/pdf/2503.15338" title="Download PDF" id="pdf-2503.15338" aria-labelledby="pdf-2503.15338">pdf</a>, <a href="https://arxiv.org/html/2503.15338v1" title="View HTML" id="html-2503.15338" 
aria-labelledby="html-2503.15338" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.15338" title="Other formats" id="oth-2503.15338" aria-labelledby="oth-2503.15338">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Solla: Towards a Speech-Oriented LLM That Hears Acoustic Context </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&query=Ao,+J">Junyi Ao</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Chen,+D">Dekun Chen</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Tian,+X">Xiaohai Tian</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Feng,+W">Wenjie Feng</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Zhang,+J">Jun Zhang</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Lu,+L">Lu Lu</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Wang,+Y">Yuxuan Wang</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Li,+H">Haizhou Li</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Wu,+Z">Zhizheng Wu</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Audio and Speech Processing (eess.AS)</span>; Computation and Language (cs.CL); Sound (cs.SD) </div> </div> </dd> <dt> <a name='item7'>[7]</a> <a href ="/abs/2503.15124" title="Abstract" id="2503.15124"> arXiv:2503.15124 </a> (cross-list from cs.HC) [<a href="/pdf/2503.15124" title="Download PDF" id="pdf-2503.15124" aria-labelledby="pdf-2503.15124">pdf</a>, <a href="https://arxiv.org/html/2503.15124v1" title="View HTML" id="html-2503.15124" aria-labelledby="html-2503.15124" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.15124" title="Other formats" id="oth-2503.15124" aria-labelledby="oth-2503.15124">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span 
class='descriptor'>Title:</span> Evaluating ASR Confidence Scores for Automated Error Detection in User-Assisted Correction Interfaces </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Kuhn,+K">Korbinian Kuhn</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kersken,+V">Verena Kersken</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zimmermann,+G">Gottfried Zimmermann</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 7 pages, 1 figure, to be published in Extended Abstracts of the CHI Conference on Human Factors in Computing Systems (CHI EA '25) </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Human-Computer Interaction (cs.HC)</span>; Computation and Language (cs.CL); Sound (cs.SD); Audio and Speech Processing (eess.AS) </div> </div> </dd> <dt> <a name='item8'>[8]</a> <a href ="/abs/2503.15120" title="Abstract" id="2503.15120"> arXiv:2503.15120 </a> (cross-list from cs.HC) [<a href="/pdf/2503.15120" title="Download PDF" id="pdf-2503.15120" aria-labelledby="pdf-2503.15120">pdf</a>, <a href="https://arxiv.org/html/2503.15120v1" title="View HTML" id="html-2503.15120" aria-labelledby="html-2503.15120" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.15120" title="Other formats" id="oth-2503.15120" aria-labelledby="oth-2503.15120">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Communication Access Real-Time Translation Through Collaborative Correction of Automatic Speech Recognition </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Kuhn,+K">Korbinian Kuhn</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kersken,+V">Verena Kersken</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zimmermann,+G">Gottfried Zimmermann</a></div> <div 
class='list-comments mathjax'><span class='descriptor'>Comments:</span> 8 pages, 2 figures, to be published in Extended Abstracts of the CHI Conference on Human Factors in Computing Systems (CHI EA '25) </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Human-Computer Interaction (cs.HC)</span>; Sound (cs.SD); Audio and Speech Processing (eess.AS) </div> </div> </dd> <dt> <a name='item9'>[9]</a> <a href ="/abs/2503.14928" title="Abstract" id="2503.14928"> arXiv:2503.14928 </a> (cross-list from cs.CV) [<a href="/pdf/2503.14928" title="Download PDF" id="pdf-2503.14928" aria-labelledby="pdf-2503.14928">pdf</a>, <a href="https://arxiv.org/html/2503.14928v1" title="View HTML" id="html-2503.14928" aria-labelledby="html-2503.14928" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.14928" title="Other formats" id="oth-2503.14928" aria-labelledby="oth-2503.14928">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Shushing! 
Let's Imagine an Authentic Speech from the Silent Video </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Ye,+J">Jiaxin Ye</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shan,+H">Hongming Shan</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Project Page: <a href="https://imagintalk.github.io" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Artificial Intelligence (cs.AI); Sound (cs.SD); Audio and Speech Processing (eess.AS) </div> </div> </dd> <dt> <a name='item10'>[10]</a> <a href ="/abs/2503.14545" title="Abstract" id="2503.14545"> arXiv:2503.14545 </a> (cross-list from cs.LG) [<a href="/pdf/2503.14545" title="Download PDF" id="pdf-2503.14545" aria-labelledby="pdf-2503.14545">pdf</a>, <a href="https://arxiv.org/html/2503.14545v1" title="View HTML" id="html-2503.14545" aria-labelledby="html-2503.14545" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.14545" title="Other formats" id="oth-2503.14545" aria-labelledby="oth-2503.14545">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> PANDORA: Diffusion Policy Learning for Dexterous Robotic Piano Playing </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+Y">Yanjia Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+R">Renjie Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tu,+Z">Zhengzhong Tu</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Robotics (cs.RO); Sound (cs.SD); Audio and Speech Processing (eess.AS) </div> </div> </dd> </dl> <dl id='articles'> 
<h3>Wed, 19 Mar 2025 (showing 5 of 5 entries )</h3> <dt> <a name='item11'>[11]</a> <a href ="/abs/2503.14345" title="Abstract" id="2503.14345"> arXiv:2503.14345 </a> (cross-list from eess.AS) [<a href="/pdf/2503.14345" title="Download PDF" id="pdf-2503.14345" aria-labelledby="pdf-2503.14345">pdf</a>, <a href="/format/2503.14345" title="Other formats" id="oth-2503.14345" aria-labelledby="oth-2503.14345">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> MoonCast: High-Quality Zero-Shot Podcast Generation </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&query=Ju,+Z">Zeqian Ju</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Yang,+D">Dongchao Yang</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Yu,+J">Jianwei Yu</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Shen,+K">Kai Shen</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Leng,+Y">Yichong Leng</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Wang,+Z">Zhengtao Wang</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Tan,+X">Xu Tan</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Zhou,+X">Xinyu Zhou</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Qin,+T">Tao Qin</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Li,+X">Xiangyang Li</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Audio and Speech Processing (eess.AS)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL); Machine Learning (cs.LG); Sound (cs.SD) </div> </div> </dd> <dt> <a name='item12'>[12]</a> <a href ="/abs/2503.14185" title="Abstract" id="2503.14185"> arXiv:2503.14185 </a> (cross-list from cs.CL) [<a href="/pdf/2503.14185" title="Download PDF" id="pdf-2503.14185" aria-labelledby="pdf-2503.14185">pdf</a>, 
<a href="https://arxiv.org/html/2503.14185v1" title="View HTML" id="html-2503.14185" aria-labelledby="html-2503.14185" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.14185" title="Other formats" id="oth-2503.14185" aria-labelledby="oth-2503.14185">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> AdaST: Dynamically Adapting Encoder States in the Decoder for End-to-End Speech-to-Text Translation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+W">Wuwei Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+D">Dexin Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xiong,+D">Deyi Xiong</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> ACL 2021 Findings </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Sound (cs.SD); Audio and Speech Processing (eess.AS) </div> </div> </dd> <dt> <a name='item13'>[13]</a> <a href ="/abs/2503.14040" title="Abstract" id="2503.14040"> arXiv:2503.14040 </a> (cross-list from cs.GR) [<a href="/pdf/2503.14040" title="Download PDF" id="pdf-2503.14040" aria-labelledby="pdf-2503.14040">pdf</a>, <a href="https://arxiv.org/html/2503.14040v1" title="View HTML" id="html-2503.14040" aria-labelledby="html-2503.14040" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.14040" title="Other formats" id="oth-2503.14040" aria-labelledby="oth-2503.14040">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> MAG: Multi-Modal Aligned Autoregressive Co-Speech Gesture Generation without Vector Quantization </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+B">Binjie Liu</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+L">Lina Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+S">Sanyi Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gu,+S">Songen Gu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhi,+Y">Yihao Zhi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhu,+T">Tianyi Zhu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+L">Lei Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ye,+L">Long Ye</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Graphics (cs.GR)</span>; Computer Vision and Pattern Recognition (cs.CV); Sound (cs.SD) </div> </div> </dd> <dt> <a name='item14'>[14]</a> <a href ="/abs/2503.14036" title="Abstract" id="2503.14036"> arXiv:2503.14036 </a> (cross-list from eess.AS) [<a href="/pdf/2503.14036" title="Download PDF" id="pdf-2503.14036" aria-labelledby="pdf-2503.14036">pdf</a>, <a href="https://arxiv.org/html/2503.14036v1" title="View HTML" id="html-2503.14036" aria-labelledby="html-2503.14036" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.14036" title="Other formats" id="oth-2503.14036" aria-labelledby="oth-2503.14036">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Variational Autoencoder for Personalized Pathological Speech Enhancement </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&query=Hou,+M">Mingchi Hou</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Kodrasi,+I">Ina Kodrasi</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Submitted to EUSIPCO 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Audio and Speech Processing (eess.AS)</span>; Sound (cs.SD) 
</div> </div> </dd> <dt> <a name='item15'>[15]</a> <a href ="/abs/2503.13763" title="Abstract" id="2503.13763"> arXiv:2503.13763 </a> (cross-list from cs.LG) [<a href="/pdf/2503.13763" title="Download PDF" id="pdf-2503.13763" aria-labelledby="pdf-2503.13763">pdf</a>, <a href="https://arxiv.org/html/2503.13763v1" title="View HTML" id="html-2503.13763" aria-labelledby="html-2503.13763" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.13763" title="Other formats" id="oth-2503.13763" aria-labelledby="oth-2503.13763">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Neural Edge Histogram Descriptors for Underwater Acoustic Target Recognition </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Agashe,+A">Atharva Agashe</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Carreiro,+D">Davelle Carreiro</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Van+Dine,+A">Alexandra Van Dine</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Peeples,+J">Joshua Peeples</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 6 pages, 5 figures. 
This work has been accepted to IEEE OCEANS 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Sound (cs.SD); Audio and Speech Processing (eess.AS) </div> </div> </dd> </dl> <dl id='articles'> <h3>Tue, 18 Mar 2025 (showing 13 of 13 entries )</h3> <dt> <a name='item16'>[16]</a> <a href ="/abs/2503.12847" title="Abstract" id="2503.12847"> arXiv:2503.12847 </a> [<a href="/pdf/2503.12847" title="Download PDF" id="pdf-2503.12847" aria-labelledby="pdf-2503.12847">pdf</a>, <a href="https://arxiv.org/html/2503.12847v1" title="View HTML" id="html-2503.12847" aria-labelledby="html-2503.12847" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.12847" title="Other formats" id="oth-2503.12847" aria-labelledby="oth-2503.12847">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Robust Audio-Visual Segmentation via Audio-Guided Visual Convergent Alignment </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+C">Chen Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+P">Peike Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+L">Liying Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+D">Dadong Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+L">Lincheng Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+X">Xin Yu</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted by CVPR2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Sound (cs.SD)</span>; Computer Vision and Pattern Recognition (cs.CV) </div> </div> </dd> <dt> <a name='item17'>[17]</a> <a href ="/abs/2503.12840" title="Abstract" id="2503.12840"> arXiv:2503.12840 </a> [<a 
href="/pdf/2503.12840" title="Download PDF" id="pdf-2503.12840" aria-labelledby="pdf-2503.12840">pdf</a>, <a href="https://arxiv.org/html/2503.12840v1" title="View HTML" id="html-2503.12840" aria-labelledby="html-2503.12840" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.12840" title="Other formats" id="oth-2503.12840" aria-labelledby="oth-2503.12840">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Dynamic Derivation and Elimination: Audio Visual Segmentation with Enhanced Audio Semantics </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+C">Chen Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+L">Liying Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+P">Peike Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+D">Dadong Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+L">Lincheng Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+X">Xin Yu</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted by CVPR2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Sound (cs.SD)</span>; Computer Vision and Pattern Recognition (cs.CV); Audio and Speech Processing (eess.AS) </div> </div> </dd> <dt> <a name='item18'>[18]</a> <a href ="/abs/2503.12589" title="Abstract" id="2503.12589"> arXiv:2503.12589 </a> [<a href="/pdf/2503.12589" title="Download PDF" id="pdf-2503.12589" aria-labelledby="pdf-2503.12589">pdf</a>, <a href="https://arxiv.org/html/2503.12589v1" title="View HTML" id="html-2503.12589" aria-labelledby="html-2503.12589" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.12589" title="Other formats" id="oth-2503.12589" aria-labelledby="oth-2503.12589">other</a>] </dt> <dd> <div class='meta'> 
<div class='list-title mathjax'><span class='descriptor'>Title:</span> Context-Aware Two-Step Training Scheme for Domain Invariant Speech Separation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+W">Wupeng Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Pan,+Z">Zexu Pan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lin,+J">Jingru Lin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+S">Shuai Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+H">Haizhou Li</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Sound (cs.SD)</span>; Audio and Speech Processing (eess.AS) </div> </div> </dd> <dt> <a name='item19'>[19]</a> <a href ="/abs/2503.12506" title="Abstract" id="2503.12506"> arXiv:2503.12506 </a> [<a href="/pdf/2503.12506" title="Download PDF" id="pdf-2503.12506" aria-labelledby="pdf-2503.12506">pdf</a>, <a href="https://arxiv.org/html/2503.12506v1" title="View HTML" id="html-2503.12506" aria-labelledby="html-2503.12506" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.12506" title="Other formats" id="oth-2503.12506" aria-labelledby="oth-2503.12506">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> A General Close-loop Predictive Coding Framework for Auditory Working Memory </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Yuan,+Z">Zhongju Yuan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wiggins,+G">Geraint Wiggins</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Botteldooren,+D">Dick Botteldooren</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Sound (cs.SD)</span>; Artificial Intelligence (cs.AI); Audio and Speech Processing (eess.AS) </div> 
</div> </dd> <dt> <a name='item20'>[20]</a> <a href ="/abs/2503.12388" title="Abstract" id="2503.12388"> arXiv:2503.12388 </a> [<a href="/pdf/2503.12388" title="Download PDF" id="pdf-2503.12388" aria-labelledby="pdf-2503.12388">pdf</a>, <a href="https://arxiv.org/html/2503.12388v1" title="View HTML" id="html-2503.12388" aria-labelledby="html-2503.12388" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.12388" title="Other formats" id="oth-2503.12388" aria-labelledby="oth-2503.12388">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Serenade: A Singing Style Conversion Framework Based On Audio Infilling </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Violeta,+L+P">Lester Phillip Violeta</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+W">Wen-Chin Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Toda,+T">Tomoki Toda</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Preprint under review </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Sound (cs.SD)</span>; Audio and Speech Processing (eess.AS) </div> </div> </dd> <dt> <a name='item21'>[21]</a> <a href ="/abs/2503.12115" title="Abstract" id="2503.12115"> arXiv:2503.12115 </a> [<a href="/pdf/2503.12115" title="Download PDF" id="pdf-2503.12115" aria-labelledby="pdf-2503.12115">pdf</a>, <a href="https://arxiv.org/html/2503.12115v1" title="View HTML" id="html-2503.12115" aria-labelledby="html-2503.12115" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.12115" title="Other formats" id="oth-2503.12115" aria-labelledby="oth-2503.12115">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Universal Speech Token Learning via Low-Bitrate Neural Codec and Pretrained 
Representations </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Jiang,+X">Xue Jiang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Peng,+X">Xiulian Peng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+Y">Yuan Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lu,+Y">Yan Lu</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted by IEEE Journal of Selected Topics in Signal Processing(JSTSP) </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Sound (cs.SD)</span>; Artificial Intelligence (cs.AI); Audio and Speech Processing (eess.AS) </div> </div> </dd> <dt> <a name='item22'>[22]</a> <a href ="/abs/2503.12042" title="Abstract" id="2503.12042"> arXiv:2503.12042 </a> [<a href="/pdf/2503.12042" title="Download PDF" id="pdf-2503.12042" aria-labelledby="pdf-2503.12042">pdf</a>, <a href="https://arxiv.org/html/2503.12042v2" title="View HTML" id="html-2503.12042" aria-labelledby="html-2503.12042" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.12042" title="Other formats" id="oth-2503.12042" aria-labelledby="oth-2503.12042">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Prosody-Enhanced Acoustic Pre-training and Acoustic-Disentangled Prosody Adapting for Movie Dubbing </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+Z">Zhedong Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+L">Liang Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yan,+C">Chenggang Yan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+C">Chunshan Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=van+den+Hengel,+A">Anton van den Hengel</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Qi,+Y">Yuankai Qi</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted by CVPR2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Sound (cs.SD)</span>; Computer Vision and Pattern Recognition (cs.CV); Audio and Speech Processing (eess.AS) </div> </div> </dd> <dt> <a name='item23'>[23]</a> <a href ="/abs/2503.11956" title="Abstract" id="2503.11956"> arXiv:2503.11956 </a> [<a href="/pdf/2503.11956" title="Download PDF" id="pdf-2503.11956" aria-labelledby="pdf-2503.11956">pdf</a>, <a href="https://arxiv.org/html/2503.11956v1" title="View HTML" id="html-2503.11956" aria-labelledby="html-2503.11956" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.11956" title="Other formats" id="oth-2503.11956" aria-labelledby="oth-2503.11956">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Optimization-Based Analysis of Music Intervals and Tuning Systems in Oral Traditions Using Pitch Histograms: A Case Study of Iranian Vocal Music </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Shafiei,+S">Sepideh Shafiei</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hakam,+S">Shapour Hakam</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Sound (cs.SD)</span>; Audio and Speech Processing (eess.AS) </div> </div> </dd> <dt> <a name='item24'>[24]</a> <a href ="/abs/2503.11896" title="Abstract" id="2503.11896"> arXiv:2503.11896 </a> [<a href="/pdf/2503.11896" title="Download PDF" id="pdf-2503.11896" aria-labelledby="pdf-2503.11896">pdf</a>, <a href="https://arxiv.org/html/2503.11896v1" title="View HTML" id="html-2503.11896" aria-labelledby="html-2503.11896" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.11896" 
title="Other formats" id="oth-2503.11896" aria-labelledby="oth-2503.11896">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Expressive Music Data Processing and Generation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+J">Jingwei Liu</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 7 pages, 4 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Sound (cs.SD)</span>; Artificial Intelligence (cs.AI); Audio and Speech Processing (eess.AS) </div> </div> </dd> <dt> <a name='item25'>[25]</a> <a href ="/abs/2503.12948" title="Abstract" id="2503.12948"> arXiv:2503.12948 </a> (cross-list from eess.AS) [<a href="/pdf/2503.12948" title="Download PDF" id="pdf-2503.12948" aria-labelledby="pdf-2503.12948">pdf</a>, <a href="https://arxiv.org/html/2503.12948v1" title="View HTML" id="html-2503.12948" aria-labelledby="html-2503.12948" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.12948" title="Other formats" id="oth-2503.12948" aria-labelledby="oth-2503.12948">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Past, Present, and Future of Spatial Audio and Room Acoustics </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&query=Koyama,+S">Shoichi Koyama</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=De+Sena,+E">Enzo De Sena</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Samarasinghe,+P">Prasanga Samarasinghe</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Thomas,+M+R+P">Mark R. P. 
Thomas</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Antonacci,+F">Fabio Antonacci</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted to International Conference on Acoustics, Speech and Signal Processing (ICASSP) 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Audio and Speech Processing (eess.AS)</span>; Sound (cs.SD) </div> </div> </dd> <dt> <a name='item26'>[26]</a> <a href ="/abs/2503.12806" title="Abstract" id="2503.12806"> arXiv:2503.12806 </a> (cross-list from cs.MM) [<a href="/pdf/2503.12806" title="Download PDF" id="pdf-2503.12806" aria-labelledby="pdf-2503.12806">pdf</a>, <a href="https://arxiv.org/html/2503.12806v1" title="View HTML" id="html-2503.12806" aria-labelledby="html-2503.12806" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.12806" title="Other formats" id="oth-2503.12806" aria-labelledby="oth-2503.12806">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> AV-Surf: Surface-Enhanced Geometry-Aware Novel-View Acoustic Synthesis </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Baek,+H">Hadam Baek</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shin,+H">Hannie Shin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Seo,+J">Jiyoung Seo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kim,+C">Chanwoo Kim</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kim,+S">Saerom Kim</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kim,+H">Hyeongbok Kim</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kim,+S">Sangpil Kim</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Multimedia (cs.MM)</span>; Computer Vision and Pattern Recognition (cs.CV); 
Sound (cs.SD); Audio and Speech Processing (eess.AS) </div> </div> </dd> <dt> <a name='item27'>[27]</a> <a href ="/abs/2503.12261" title="Abstract" id="2503.12261"> arXiv:2503.12261 </a> (cross-list from cs.CV) [<a href="/pdf/2503.12261" title="Download PDF" id="pdf-2503.12261" aria-labelledby="pdf-2503.12261">pdf</a>, <a href="https://arxiv.org/html/2503.12261v1" title="View HTML" id="html-2503.12261" aria-labelledby="html-2503.12261" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.12261" title="Other formats" id="oth-2503.12261" aria-labelledby="oth-2503.12261">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Handling Weak Complementary Relationships for Audio-Visual Emotion Recognition </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Praveen,+R+G">R. Gnana Praveen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Alam,+J">Jahangir Alam</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Submission to valence arousal track of 8th ABAW competition. 
arXiv admin note: substantial text overlap with <a href="https://arxiv.org/abs/2403.13659" data-arxiv-id="2403.13659" class="link-https">arXiv:2403.13659</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Sound (cs.SD); Audio and Speech Processing (eess.AS) </div> </div> </dd> <dt> <a name='item28'>[28]</a> <a href ="/abs/2503.12131" title="Abstract" id="2503.12131"> arXiv:2503.12131 </a> (cross-list from cs.CV) [<a href="/pdf/2503.12131" title="Download PDF" id="pdf-2503.12131" aria-labelledby="pdf-2503.12131">pdf</a>, <a href="https://arxiv.org/html/2503.12131v1" title="View HTML" id="html-2503.12131" aria-labelledby="html-2503.12131" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.12131" title="Other formats" id="oth-2503.12131" aria-labelledby="oth-2503.12131">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> DiffGAP: A Lightweight Diffusion Module in Contrastive Space for Bridging Cross-Model Gap </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Mo,+S">Shentong Mo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+Z">Zehua Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bao,+F">Fan Bao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhu,+J">Jun Zhu</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Artificial Intelligence (cs.AI); Machine Learning (cs.LG); Sound (cs.SD); Audio and Speech Processing (eess.AS) </div> </div> </dd> </dl> <dl id='articles'> <h3>Mon, 17 Mar 2025 (showing 11 of 11 entries )</h3> <dt> <a name='item29'>[29]</a> <a href ="/abs/2503.11627" title="Abstract" id="2503.11627"> arXiv:2503.11627 </a> [<a href="/pdf/2503.11627" 
title="Download PDF" id="pdf-2503.11627" aria-labelledby="pdf-2503.11627">pdf</a>, <a href="https://arxiv.org/html/2503.11627v1" title="View HTML" id="html-2503.11627" aria-labelledby="html-2503.11627" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.11627" title="Other formats" id="oth-2503.11627" aria-labelledby="oth-2503.11627">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Are Deep Speech Denoising Models Robust to Adversarial Noise? </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Schwarzer,+W">Will Schwarzer</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Thomas,+P+S">Philip S. Thomas</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Fanelli,+A">Andrea Fanelli</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+X">Xiaoyu Liu</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 13 pages, 5 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Sound (cs.SD)</span>; Machine Learning (cs.LG); Audio and Speech Processing (eess.AS) </div> </div> </dd> <dt> <a name='item30'>[30]</a> <a href ="/abs/2503.11562" title="Abstract" id="2503.11562"> arXiv:2503.11562 </a> [<a href="/pdf/2503.11562" title="Download PDF" id="pdf-2503.11562" aria-labelledby="pdf-2503.11562">pdf</a>, <a href="https://arxiv.org/html/2503.11562v1" title="View HTML" id="html-2503.11562" aria-labelledby="html-2503.11562" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.11562" title="Other formats" id="oth-2503.11562" aria-labelledby="oth-2503.11562">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Designing Neural Synthesizers for Low Latency Interaction </div> <div class='list-authors'><a 
href="https://arxiv.org/search/cs?searchtype=author&query=Caspe,+F">Franco Caspe</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Shier,+J">Jordie Shier</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sandler,+M">Mark Sandler</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Saitis,+C">Charalampos Saitis</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=McPherson,+A">Andrew McPherson</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> See website at <a href="http://fcaspe.github.io/brave" rel="external noopener nofollow" class="link-external link-http">this http URL</a> - 13 pages, 5 figures, accepted to the Journal of the Audio Engineering Society </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Sound (cs.SD)</span>; Artificial Intelligence (cs.AI); Machine Learning (cs.LG); Audio and Speech Processing (eess.AS) </div> </div> </dd> <dt> <a name='item31'>[31]</a> <a href ="/abs/2503.11373" title="Abstract" id="2503.11373"> arXiv:2503.11373 </a> [<a href="/pdf/2503.11373" title="Download PDF" id="pdf-2503.11373" aria-labelledby="pdf-2503.11373">pdf</a>, <a href="https://arxiv.org/html/2503.11373v1" title="View HTML" id="html-2503.11373" aria-labelledby="html-2503.11373" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.11373" title="Other formats" id="oth-2503.11373" aria-labelledby="oth-2503.11373">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Exploring Performance-Complexity Trade-Offs in Sound Event Detection </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Morocutti,+T">Tobias Morocutti</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Schmid,+F">Florian Schmid</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Greif,+J">Jonathan Greif</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Foscarin,+F">Francesco Foscarin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Widmer,+G">Gerhard Widmer</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Sound (cs.SD)</span>; Machine Learning (cs.LG); Audio and Speech Processing (eess.AS) </div> </div> </dd> <dt> <a name='item32'>[32]</a> <a href ="/abs/2503.11363" title="Abstract" id="2503.11363"> arXiv:2503.11363 </a> [<a href="/pdf/2503.11363" title="Download PDF" id="pdf-2503.11363" aria-labelledby="pdf-2503.11363">pdf</a>, <a href="https://arxiv.org/html/2503.11363v1" title="View HTML" id="html-2503.11363" aria-labelledby="html-2503.11363" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.11363" title="Other formats" id="oth-2503.11363" aria-labelledby="oth-2503.11363">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Creating a Good Teacher for Knowledge Distillation in Acoustic Scene Classification </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Morocutti,+T">Tobias Morocutti</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Schmid,+F">Florian Schmid</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Koutini,+K">Khaled Koutini</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Widmer,+G">Gerhard Widmer</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Sound (cs.SD)</span>; Machine Learning (cs.LG); Audio and Speech Processing (eess.AS) </div> </div> </dd> <dt> <a name='item33'>[33]</a> <a href ="/abs/2503.11229" title="Abstract" id="2503.11229"> arXiv:2503.11229 </a> [<a href="/pdf/2503.11229" title="Download PDF" id="pdf-2503.11229" aria-labelledby="pdf-2503.11229">pdf</a>, <a href="https://arxiv.org/html/2503.11229v1" title="View HTML" 
id="html-2503.11229" aria-labelledby="html-2503.11229" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.11229" title="Other formats" id="oth-2503.11229" aria-labelledby="oth-2503.11229">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Exploring the Potential of Large Multimodal Models as Effective Alternatives for Pronunciation Assessment </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+K">Ke Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=He,+L">Lei He</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+K">Kun Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Deng,+Y">Yan Deng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wei,+W">Wenning Wei</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+S">Sheng Zhao</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 7 pages </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Sound (cs.SD)</span>; Computation and Language (cs.CL); Audio and Speech Processing (eess.AS) </div> </div> </dd> <dt> <a name='item34'>[34]</a> <a href ="/abs/2503.11206" title="Abstract" id="2503.11206"> arXiv:2503.11206 </a> [<a href="/pdf/2503.11206" title="Download PDF" id="pdf-2503.11206" aria-labelledby="pdf-2503.11206">pdf</a>, <a href="https://arxiv.org/html/2503.11206v1" title="View HTML" id="html-2503.11206" aria-labelledby="html-2503.11206" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.11206" title="Other formats" id="oth-2503.11206" aria-labelledby="oth-2503.11206">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Comparative Study of Spike Encoding Methods for Environmental Sound Classification </div> <div 
class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Larroza,+A">Andres Larroza</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Naranjo-Alcazar,+J">Javier Naranjo-Alcazar</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Castell%C3%B3,+V+O">Vicent Ortiz Castelló</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zuccarello,+P">Pedro Zuccarello</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Under review EUSIPCO 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Sound (cs.SD)</span>; Emerging Technologies (cs.ET); Audio and Speech Processing (eess.AS) </div> </div> </dd> <dt> <a name='item35'>[35]</a> <a href ="/abs/2503.11197" title="Abstract" id="2503.11197"> arXiv:2503.11197 </a> [<a href="/pdf/2503.11197" title="Download PDF" id="pdf-2503.11197" aria-labelledby="pdf-2503.11197">pdf</a>, <a href="https://arxiv.org/html/2503.11197v3" title="View HTML" id="html-2503.11197" aria-labelledby="html-2503.11197" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.11197" title="Other formats" id="oth-2503.11197" aria-labelledby="oth-2503.11197">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Reinforcement Learning Outperforms Supervised Fine-Tuning: A Case Study on Audio Question Answering </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+G">Gang Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+J">Jizhong Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Dinkel,+H">Heinrich Dinkel</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Niu,+Y">Yadong Niu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+J">Junbo Zhang</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Luan,+J">Jian Luan</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Sound (cs.SD)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL); Audio and Speech Processing (eess.AS) </div> </div> </dd> <dt> <a name='item36'>[36]</a> <a href ="/abs/2503.11190" title="Abstract" id="2503.11190"> arXiv:2503.11190 </a> [<a href="/pdf/2503.11190" title="Download PDF" id="pdf-2503.11190" aria-labelledby="pdf-2503.11190">pdf</a>, <a href="https://arxiv.org/html/2503.11190v1" title="View HTML" id="html-2503.11190" aria-labelledby="html-2503.11190" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.11190" title="Other formats" id="oth-2503.11190" aria-labelledby="oth-2503.11190">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Cross-Modal Learning for Music-to-Music-Video Description Generation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Mao,+Z">Zhuoyuan Mao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhao,+M">Mengjie Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+Q">Qiyu Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhong,+Z">Zhi Zhong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liao,+W">Wei-Hsiang Liao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wakaki,+H">Hiromi Wakaki</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mitsufuji,+Y">Yuki Mitsufuji</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted by RepL4NLP 2025 @ NAACL 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Sound (cs.SD)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL); Multimedia 
(cs.MM); Audio and Speech Processing (eess.AS) </div> </div> </dd> <dt> <a name='item37'>[37]</a> <a href ="/abs/2503.11315" title="Abstract" id="2503.11315"> arXiv:2503.11315 </a> (cross-list from cs.CV) [<a href="/pdf/2503.11315" title="Download PDF" id="pdf-2503.11315" aria-labelledby="pdf-2503.11315">pdf</a>, <a href="https://arxiv.org/html/2503.11315v1" title="View HTML" id="html-2503.11315" aria-labelledby="html-2503.11315" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.11315" title="Other formats" id="oth-2503.11315" aria-labelledby="oth-2503.11315">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> MMS-LLaMA: Efficient LLM-based Audio-Visual Speech Recognition with Minimal Multimodal Speech Tokens </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Yeo,+J+H">Jeong Hun Yeo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Rha,+H">Hyeongseop Rha</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Park,+S+J">Se Jin Park</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ro,+Y+M">Yong Man Ro</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> The code and models are available <a href="https://github.com/JeongHun0716/MMS-LLaMA" rel="external noopener nofollow" class="link-external link-https">this https URL</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computer Vision and Pattern Recognition (cs.CV)</span>; Multimedia (cs.MM); Sound (cs.SD); Audio and Speech Processing (eess.AS) </div> </div> </dd> <dt> <a name='item38'>[38]</a> <a href ="/abs/2503.11312" title="Abstract" id="2503.11312"> arXiv:2503.11312 </a> (cross-list from eess.SP) [<a href="/pdf/2503.11312" title="Download PDF" id="pdf-2503.11312" aria-labelledby="pdf-2503.11312">pdf</a>, <a href="https://arxiv.org/html/2503.11312v1" 
title="View HTML" id="html-2503.11312" aria-labelledby="html-2503.11312" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.11312" title="Other formats" id="oth-2503.11312" aria-labelledby="oth-2503.11312">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> A Data-Driven Exploration of Elevation Cues in HRTFs: An Explainable AI Perspective Across Multiple Datasets </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&query=De+Rus,+J+A">Juan Antonio De Rus</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Montagud,+M">Mario Montagud</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Lopez-Ballester,+J">Jesus Lopez-Ballester</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Ferri,+F+J">Francesc J. Ferri</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Cobos,+M">Maximo Cobos</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 14 pages, 9 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Signal Processing (eess.SP)</span>; Sound (cs.SD); Audio and Speech Processing (eess.AS) </div> </div> </dd> <dt> <a name='item39'>[39]</a> <a href ="/abs/2503.11080" title="Abstract" id="2503.11080"> arXiv:2503.11080 </a> (cross-list from cs.CL) [<a href="/pdf/2503.11080" title="Download PDF" id="pdf-2503.11080" aria-labelledby="pdf-2503.11080">pdf</a>, <a href="https://arxiv.org/html/2503.11080v1" title="View HTML" id="html-2503.11080" aria-labelledby="html-2503.11080" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.11080" title="Other formats" id="oth-2503.11080" aria-labelledby="oth-2503.11080">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Joint Training And Decoding for Multilingual End-to-End Simultaneous 
Speech Translation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+W">Wuwei Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jin,+R">Renren Jin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+W">Wen Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Luan,+J">Jian Luan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Wang,+B">Bin Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xiong,+D">Deyi Xiong</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> ICASSP 2023 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Sound (cs.SD); Audio and Speech Processing (eess.AS) </div> </div> </dd> </dl> <div class='paging'>Total of 39 entries </div> <div class='morefewer'>Showing up to 50 entries per page: <a href="/list/cs.SD/recent?skip=0&show=25" rel="nofollow"> fewer</a> | <span style="color: #454545">more</span> | <span style="color: #454545">all</span> </div> </div> </div> </div> </main> <footer style="clear: both;"> <div class="columns is-desktop" role="navigation" aria-label="Secondary" style="margin: -0.75em -0.75em 0.75em -0.75em">  <div class="column" style="padding: 0;"> <div class="columns"> <div class="column"> <ul style="list-style: none; line-height: 2;"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul style="list-style: none; line-height: 2;"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 
15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div>   <div class="column" style="padding: 0;"> <div class="columns"> <div class="column"> <ul style="list-style: none; line-height: 2;"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul style="list-style: none; line-height: 2;"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg 
xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div>   </div> </footer> </div> <script src="/static/base/1.0.1/js/member_acknowledgement.js"></script> </body> </html>

CINXE.COM

Sound