CINXE.COM

Audio and Speech Processing

<!DOCTYPE html> <html lang="en"> <head> <title>Audio and Speech Processing </title> <meta name="viewport" content="width=device-width, initial-scale=1"> <link rel="apple-touch-icon" sizes="180x180" href="/static/browse/0.3.4/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="/static/browse/0.3.4/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="/static/browse/0.3.4/images/icons/favicon-16x16.png"> <link rel="manifest" href="/static/browse/0.3.4/images/icons/site.webmanifest"> <link rel="mask-icon" href="/static/browse/0.3.4/images/icons/safari-pinned-tab.svg" color="#5bbad5"> <meta name="msapplication-TileColor" content="#da532c"> <meta name="theme-color" content="#ffffff"> <link rel="stylesheet" type="text/css" media="screen" href="/static/browse/0.3.4/css/arXiv.css?v=20241206" /> <link rel="stylesheet" type="text/css" media="print" href="/static/browse/0.3.4/css/arXiv-print.css?v=20200611" /> <link rel="stylesheet" type="text/css" media="screen" href="/static/browse/0.3.4/css/browse_search.css" /> <script language="javascript" src="/static/browse/0.3.4/js/accordion.js"></script> <script src="/static/browse/0.3.4/js/mathjaxToggle.min.js" type="text/javascript"></script> <script type="text/javascript" language="javascript">mathjaxToggle();</script> </head> <body class="with-cu-identity"> <div class="flex-wrap-footer"> <header> <a href="#content" class="is-sr-only">Skip to main content</a> <!-- start desktop header --> <div class="columns is-vcentered is-hidden-mobile" id="cu-identity"> <div class="column" id="cu-logo"> <a href="https://www.cornell.edu/"><img src="/static/browse/0.3.4/images/icons/cu/cornell-reduced-white-SMALL.svg" alt="Cornell University" /></a> </div><div class="column" id="support-ack"> <span id="support-ack-url">We gratefully acknowledge support from the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all 
contributors.</span> <a href="https://info.arxiv.org/about/donate.html" class="btn-header-donate">Donate</a> </div> </div> <div id="header" class="is-hidden-mobile"> <a aria-hidden="true" tabindex="-1" href="/IgnoreMe"></a> <div class="header-breadcrumbs"> <a href="/"><img src="/static/browse/0.3.4/images/arxiv-logo-one-color-white.svg" alt="arxiv logo" style="height:40px;"/></a> <span>&gt;</span> <a href="/list/eess.AS/recent">eess.AS</a> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div><!-- /end desktop header --> <div class="mobile-header"> <div class="columns is-mobile"> <div class="column logo-arxiv"><a 
href="https://arxiv.org/"><img src="/static/browse/0.3.4/images/arxiv-logomark-small-white.svg" alt="arXiv logo" style="height:60px;" /></a></div> <div class="column logo-cornell"><a href="https://www.cornell.edu/"> <picture> <source media="(min-width: 501px)" srcset="/static/browse/0.3.4/images/icons/cu/cornell-reduced-white-SMALL.svg 400w" sizes="400w" /> <source srcset="/static/browse/0.3.4/images/icons/cu/cornell_seal_simple_black.svg 2x" /> <img src="/static/browse/0.3.4/images/icons/cu/cornell-reduced-white-SMALL.svg" alt="Cornell University Logo" /> </picture> </a></div> <div class="column nav" id="toggle-container" role="menubar"> <button class="toggle-control"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-white"><title>open search</title><path d="M505 442.7L405.3 343c-4.5-4.5-10.6-7-17-7H372c27.6-35.3 44-79.7 44-128C416 93.1 322.9 0 208 0S0 93.1 0 208s93.1 208 208 208c48.3 0 92.7-16.4 128-44v16.3c0 6.4 2.5 12.5 7 17l99.7 99.7c9.4 9.4 24.6 9.4 33.9 0l28.3-28.3c9.4-9.4 9.4-24.6.1-34zM208 336c-70.7 0-128-57.2-128-128 0-70.7 57.2-128 128-128 70.7 0 128 57.2 128 128 0 70.7-57.2 128-128 128z"/></svg></button> <div class="mobile-toggle-block toggle-target"> <form class="mobile-search-form" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <input class="input" type="text" name="query" placeholder="Search..." 
aria-label="Search term or terms" /> <input type="hidden" name="source" value="header"> <input type="hidden" name="searchtype" value="all"> <button class="button">GO</button> </div> </form> </div> <button class="toggle-control"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-white" role="menu"><title>open navigation menu</title><path d="M16 132h416c8.837 0 16-7.163 16-16V76c0-8.837-7.163-16-16-16H16C7.163 60 0 67.163 0 76v40c0 8.837 7.163 16 16 16zm0 160h416c8.837 0 16-7.163 16-16v-40c0-8.837-7.163-16-16-16H16c-8.837 0-16 7.163-16 16v40c0 8.837 7.163 16 16 16zm0 160h416c8.837 0 16-7.163 16-16v-40c0-8.837-7.163-16-16-16H16c-8.837 0-16 7.163-16 16v40c0 8.837 7.163 16 16 16z"/></svg></button> <div class="mobile-toggle-block toggle-target"> <nav class="mobile-menu" aria-labelledby="mobilemenulabel"> <h2 id="mobilemenulabel">quick links</h2> <ul> <li><a href="https://arxiv.org/login">Login</a></li> <li><a href="https://info.arxiv.org/help">Help Pages</a></li> <li><a href="https://info.arxiv.org/about">About</a></li> </ul> </nav> </div> </div> </div> </div><!-- /end mobile-header --> </header> <main> <div id="content"> <div id='content-inner'> <div id='dlpage'> <h1>Audio and Speech Processing</h1> <h2>Authors and titles for recent submissions</h2> <ul> <li> <a href="/list/eess.AS/recent?skip=0&amp;show=50"> Fri, 21 Feb 2025 </a> </li><li> <a href="/list/eess.AS/recent?skip=13&amp;show=50"> Thu, 20 Feb 2025 </a> </li><li> <a href="/list/eess.AS/recent?skip=21&amp;show=50"> Wed, 19 Feb 2025 </a> </li><li> <a href="/list/eess.AS/recent?skip=24&amp;show=50"> Tue, 18 Feb 2025 </a> </li><li> <a href="/list/eess.AS/recent?skip=40&amp;show=50"> Mon, 17 Feb 2025 </a> </li></ul> <p>See today's <a id="new-eess.AS" aria-labelledby="new-eess.AS" href="/list/eess.AS/new">new</a> changes</p> <div class='paging'>Total of 50 entries </div> <div class='morefewer'>Showing up to 50 entries per page: <a href="/list/eess.AS/recent?skip=0&amp;show=25" 
rel="nofollow"> fewer</a> | <span style="color: #454545">more</span> | <span style="color: #454545">all</span> </div> <dl id='articles'> <h3>Fri, 21 Feb 2025 (showing 13 of 13 entries )</h3> <dt> <a name='item1'>[1]</a> <a href ="/abs/2502.14418" title="Abstract" id="2502.14418"> arXiv:2502.14418 </a> [<a href="/pdf/2502.14418" title="Download PDF" id="pdf-2502.14418" aria-labelledby="pdf-2502.14418">pdf</a>, <a href="https://arxiv.org/html/2502.14418v1" title="View HTML" id="html-2502.14418" aria-labelledby="html-2502.14418" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.14418" title="Other formats" id="oth-2502.14418" aria-labelledby="oth-2502.14418">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Role of the Pretraining and the Adaptation data sizes for low-resource real-time MRI video segmentation </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Tholan,+M+T">Masoud Thajudeen Tholan</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Hegde,+V">Vinayaka Hegde</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Sharma,+C">Chetan Sharma</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Ghosh,+P+K">Prasanta Kumar Ghosh</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted to ICASSP 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Audio and Speech Processing (eess.AS)</span>; Computer Vision and Pattern Recognition (cs.CV); Signal Processing (eess.SP) </div> </div> </dd> <dt> <a name='item2'>[2]</a> <a href ="/abs/2502.14224" title="Abstract" id="2502.14224"> arXiv:2502.14224 </a> [<a href="/pdf/2502.14224" title="Download PDF" id="pdf-2502.14224" aria-labelledby="pdf-2502.14224">pdf</a>, <a href="https://arxiv.org/html/2502.14224v1" title="View HTML" 
id="html-2502.14224" aria-labelledby="html-2502.14224" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.14224" title="Other formats" id="oth-2502.14224" aria-labelledby="oth-2502.14224">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Adaptive Convolution for CNN-based Speech Enhancement Models </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Wang,+D">Dahan Wang</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Rong,+X">Xiaobin Rong</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Sun,+S">Shiruo Sun</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Hu,+Y">Yuxiang Hu</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Zhu,+C">Changbao Zhu</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Lu,+J">Jing Lu</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Submitted to IEEE/ACM Transactions on Audio, Speech, and Language Processing </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Audio and Speech Processing (eess.AS)</span>; Sound (cs.SD) </div> </div> </dd> <dt> <a name='item3'>[3]</a> <a href ="/abs/2502.13983" title="Abstract" id="2502.13983"> arXiv:2502.13983 </a> [<a href="/pdf/2502.13983" title="Download PDF" id="pdf-2502.13983" aria-labelledby="pdf-2502.13983">pdf</a>, <a href="https://arxiv.org/html/2502.13983v1" title="View HTML" id="html-2502.13983" aria-labelledby="html-2502.13983" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.13983" title="Other formats" id="oth-2502.13983" aria-labelledby="oth-2502.13983">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Gesture-Aware Zero-Shot Speech Recognition for Patients with Language Disorders 
</div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Kim,+S">Seungbae Kim</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Lee,+D">Daeun Lee</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Stark,+B">Brielle Stark</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Han,+J">Jinyoung Han</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Audio and Speech Processing (eess.AS)</span>; Artificial Intelligence (cs.AI) </div> </div> </dd> <dt> <a name='item4'>[4]</a> <a href ="/abs/2502.13982" title="Abstract" id="2502.13982"> arXiv:2502.13982 </a> [<a href="/pdf/2502.13982" title="Download PDF" id="pdf-2502.13982" aria-labelledby="pdf-2502.13982">pdf</a>, <a href="https://arxiv.org/html/2502.13982v1" title="View HTML" id="html-2502.13982" aria-labelledby="html-2502.13982" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.13982" title="Other formats" id="oth-2502.13982" aria-labelledby="oth-2502.13982">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Benchmarking Automatic Speech Recognition coupled LLM Modules for Medical Diagnostics </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Kumar,+K">Kabir Kumar</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Audio and Speech Processing (eess.AS)</span>; Machine Learning (cs.LG) </div> </div> </dd> <dt> <a name='item5'>[5]</a> <a href ="/abs/2502.14727" title="Abstract" id="2502.14727"> arXiv:2502.14727 </a> (cross-list from cs.SD) [<a href="/pdf/2502.14727" title="Download PDF" id="pdf-2502.14727" aria-labelledby="pdf-2502.14727">pdf</a>, <a href="https://arxiv.org/html/2502.14727v1" title="View HTML" id="html-2502.14727" aria-labelledby="html-2502.14727" 
rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.14727" title="Other formats" id="oth-2502.14727" aria-labelledby="oth-2502.14727">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> WavRAG: Audio-Integrated Retrieval Augmented Generation for Spoken Dialogue Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Chen,+Y">Yifu Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Ji,+S">Shengpeng Ji</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wang,+H">Haoxiao Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wang,+Z">Ziqing Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Chen,+S">Siyu Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=He,+J">Jinzheng He</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Xu,+J">Jin Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zhao,+Z">Zhou Zhao</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Sound (cs.SD)</span>; Artificial Intelligence (cs.AI); Audio and Speech Processing (eess.AS) </div> </div> </dd> <dt> <a name='item6'>[6]</a> <a href ="/abs/2502.14726" title="Abstract" id="2502.14726"> arXiv:2502.14726 </a> (cross-list from cs.SD) [<a href="/pdf/2502.14726" title="Download PDF" id="pdf-2502.14726" aria-labelledby="pdf-2502.14726">pdf</a>, <a href="https://arxiv.org/html/2502.14726v1" title="View HTML" id="html-2502.14726" aria-labelledby="html-2502.14726" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.14726" title="Other formats" id="oth-2502.14726" aria-labelledby="oth-2502.14726">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Pitch Imperfect: Detecting Audio Deepfakes Through 
Acoustic Prosodic Analysis </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Warren,+K">Kevin Warren</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Olszewski,+D">Daniel Olszewski</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Layton,+S">Seth Layton</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Butler,+K">Kevin Butler</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Gates,+C">Carrie Gates</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Traynor,+P">Patrick Traynor</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Sound (cs.SD)</span>; Cryptography and Security (cs.CR); Audio and Speech Processing (eess.AS) </div> </div> </dd> <dt> <a name='item7'>[7]</a> <a href ="/abs/2502.14685" title="Abstract" id="2502.14685"> arXiv:2502.14685 </a> (cross-list from cs.SD) [<a href="/pdf/2502.14685" title="Download PDF" id="pdf-2502.14685" aria-labelledby="pdf-2502.14685">pdf</a>, <a href="https://arxiv.org/html/2502.14685v1" title="View HTML" id="html-2502.14685" aria-labelledby="html-2502.14685" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.14685" title="Other formats" id="oth-2502.14685" aria-labelledby="oth-2502.14685">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> SegAug: CTC-Aligned Segmented Augmentation For Robust RNN-Transducer Based Speech Recognition </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Le,+K">Khanh Le</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Ho,+T+V">Tuan Vu Ho</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Tran,+D">Dung Tran</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Chau,+D+T">Duc Thanh Chau</a></div> <div 
class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted to ICASSP 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Sound (cs.SD)</span>; Audio and Speech Processing (eess.AS) </div> </div> </dd> <dt> <a name='item8'>[8]</a> <a href ="/abs/2502.14673" title="Abstract" id="2502.14673"> arXiv:2502.14673 </a> (cross-list from cs.SD) [<a href="/pdf/2502.14673" title="Download PDF" id="pdf-2502.14673" aria-labelledby="pdf-2502.14673">pdf</a>, <a href="https://arxiv.org/html/2502.14673v1" title="View HTML" id="html-2502.14673" aria-labelledby="html-2502.14673" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.14673" title="Other formats" id="oth-2502.14673" aria-labelledby="oth-2502.14673">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> ChunkFormer: Masked Chunking Conformer For Long-Form Speech Transcription </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Le,+K">Khanh Le</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Ho,+T+V">Tuan Vu Ho</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Tran,+D">Dung Tran</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Chau,+D+T">Duc Thanh Chau</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted to ICASSP 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Sound (cs.SD)</span>; Audio and Speech Processing (eess.AS) </div> </div> </dd> <dt> <a name='item9'>[9]</a> <a href ="/abs/2502.14627" title="Abstract" id="2502.14627"> arXiv:2502.14627 </a> (cross-list from cs.SD) [<a href="/pdf/2502.14627" title="Download PDF" id="pdf-2502.14627" aria-labelledby="pdf-2502.14627">pdf</a>, <a href="https://arxiv.org/html/2502.14627v1" title="View HTML" 
id="html-2502.14627" aria-labelledby="html-2502.14627" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.14627" title="Other formats" id="oth-2502.14627" aria-labelledby="oth-2502.14627">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> ATRI: Mitigating Multilingual Audio Text Retrieval Inconsistencies by Reducing Data Distribution Errors </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Yin,+Y">Yuguo Yin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Xie,+Y">Yuxin Xie</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Yang,+W">Wenyuan Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Yang,+D">Dongchao Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Ru,+J">Jinghan Ru</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zhuang,+X">Xianwei Zhuang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Liang,+L">Liming Liang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zou,+Y">Yuexian Zou</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Sound (cs.SD)</span>; Artificial Intelligence (cs.AI); Audio and Speech Processing (eess.AS) </div> </div> </dd> <dt> <a name='item10'>[10]</a> <a href ="/abs/2502.14405" title="Abstract" id="2502.14405"> arXiv:2502.14405 </a> (cross-list from cs.SD) [<a href="/pdf/2502.14405" title="Download PDF" id="pdf-2502.14405" aria-labelledby="pdf-2502.14405">pdf</a>, <a href="https://arxiv.org/html/2502.14405v1" title="View HTML" id="html-2502.14405" aria-labelledby="html-2502.14405" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.14405" title="Other formats" id="oth-2502.14405" aria-labelledby="oth-2502.14405">other</a>] </dt> <dd> <div class='meta'> <div class='list-title 
mathjax'><span class='descriptor'>Title:</span> Differentiable Black-box and Gray-box Modeling of Nonlinear Audio Effects </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Comunit%C3%A0,+M">Marco Comunità</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Steinmetz,+C+J">Christian J. Steinmetz</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Reiss,+J+D">Joshua D. Reiss</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Sound (cs.SD)</span>; Audio and Speech Processing (eess.AS) </div> </div> </dd> <dt> <a name='item11'>[11]</a> <a href ="/abs/2502.14178" title="Abstract" id="2502.14178"> arXiv:2502.14178 </a> (cross-list from cs.GR) [<a href="/pdf/2502.14178" title="Download PDF" id="pdf-2502.14178" aria-labelledby="pdf-2502.14178">pdf</a>, <a href="https://arxiv.org/html/2502.14178v1" title="View HTML" id="html-2502.14178" aria-labelledby="html-2502.14178" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.14178" title="Other formats" id="oth-2502.14178" aria-labelledby="oth-2502.14178">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> NeRF-3DTalker: Neural Radiance Field with 3D Prior Aided Audio Disentanglement for Talking Head Synthesis </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Liu,+X">Xiaoxing Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Liu,+Z">Zhilei Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Bi,+C">Chongke Bi</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted by ICASSP 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Graphics (cs.GR)</span>; Computer Vision and Pattern Recognition (cs.CV); Multimedia (cs.MM); 
Sound (cs.SD); Audio and Speech Processing (eess.AS) </div> </div> </dd> <dt> <a name='item12'>[12]</a> <a href ="/abs/2502.14145" title="Abstract" id="2502.14145"> arXiv:2502.14145 </a> (cross-list from cs.CL) [<a href="/pdf/2502.14145" title="Download PDF" id="pdf-2502.14145" aria-labelledby="pdf-2502.14145">pdf</a>, <a href="https://arxiv.org/html/2502.14145v1" title="View HTML" id="html-2502.14145" aria-labelledby="html-2502.14145" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.14145" title="Other formats" id="oth-2502.14145" aria-labelledby="oth-2502.14145">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> LLM-Enhanced Dialogue Management for Full-Duplex Spoken Dialogue Systems </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zhang,+H">Hao Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Li,+W">Weiwei Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Chen,+R">Rilin Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Kothapally,+V">Vinay Kothapally</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Yu,+M">Meng Yu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Yu,+D">Dong Yu</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> In submission to INTERSPEECH 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Audio and Speech Processing (eess.AS) </div> </div> </dd> <dt> <a name='item13'>[13]</a> <a href ="/abs/2502.14110" title="Abstract" id="2502.14110"> arXiv:2502.14110 </a> (cross-list from cs.SD) [<a href="/pdf/2502.14110" title="Download PDF" id="pdf-2502.14110" aria-labelledby="pdf-2502.14110">pdf</a>, <a href="https://arxiv.org/html/2502.14110v1" title="View HTML" 
id="html-2502.14110" aria-labelledby="html-2502.14110" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.14110" title="Other formats" id="oth-2502.14110" aria-labelledby="oth-2502.14110">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> On the application of Visibility Graphs in the Spectral Domain for Speaker Recognition </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Bocaccio,+H">Hernan Bocaccio</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Iglesias-P%C3%A9rez,+S">Sergio Iglesias-Pérez</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Romance,+M">Miguel Romance</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Criado,+R">Regino Criado</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Mindlin,+G+B">Gabriel B. Mindlin</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 13 pages, 5 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Sound (cs.SD)</span>; Audio and Speech Processing (eess.AS) </div> </div> </dd> </dl> <dl id='articles'> <h3>Thu, 20 Feb 2025 (showing 8 of 8 entries )</h3> <dt> <a name='item14'>[14]</a> <a href ="/abs/2502.13473" title="Abstract" id="2502.13473"> arXiv:2502.13473 </a> [<a href="/pdf/2502.13473" title="Download PDF" id="pdf-2502.13473" aria-labelledby="pdf-2502.13473">pdf</a>, <a href="https://arxiv.org/html/2502.13473v1" title="View HTML" id="html-2502.13473" aria-labelledby="html-2502.13473" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.13473" title="Other formats" id="oth-2502.13473" aria-labelledby="oth-2502.13473">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Multi-channel Replay Speech Detection using an Adaptive Learnable 
Beamformer </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Neri,+M">Michael Neri</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Virtanen,+T">Tuomas Virtanen</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Submitted to IEEE Open Journal of Signal Processing </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Audio and Speech Processing (eess.AS)</span>; Signal Processing (eess.SP) </div> </div> </dd> <dt> <a name='item15'>[15]</a> <a href ="/abs/2502.13446" title="Abstract" id="2502.13446"> arXiv:2502.13446 </a> [<a href="/pdf/2502.13446" title="Download PDF" id="pdf-2502.13446" aria-labelledby="pdf-2502.13446">pdf</a>, <a href="https://arxiv.org/html/2502.13446v1" title="View HTML" id="html-2502.13446" aria-labelledby="html-2502.13446" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.13446" title="Other formats" id="oth-2502.13446" aria-labelledby="oth-2502.13446">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Adopting Whisper for Confidence Estimation </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Aggarwal,+V">Vaibhav Aggarwal</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Nair,+S+S">Shabari S Nair</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Verma,+Y">Yash Verma</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Jogi,+Y">Yash Jogi</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted at IEEE ICASSP 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Audio and Speech Processing (eess.AS)</span>; Machine Learning (cs.LG) </div> </div> </dd> <dt> <a name='item16'>[16]</a> <a href 
="/abs/2502.13893" title="Abstract" id="2502.13893"> arXiv:2502.13893 </a> (cross-list from cs.SD) [<a href="/pdf/2502.13893" title="Download PDF" id="pdf-2502.13893" aria-labelledby="pdf-2502.13893">pdf</a>, <a href="https://arxiv.org/html/2502.13893v1" title="View HTML" id="html-2502.13893" aria-labelledby="html-2502.13893" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.13893" title="Other formats" id="oth-2502.13893" aria-labelledby="oth-2502.13893">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Audio-Based Classification of Insect Species Using Machine Learning Models: Cicada, Beetle, Termite, and Cricket </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Shetty,+M+V">Manas V Shetty</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Kumar,+Y+D+S">Yoga Disha Sendhil Kumar</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Sound (cs.SD)</span>; Audio and Speech Processing (eess.AS) </div> </div> </dd> <dt> <a name='item17'>[17]</a> <a href ="/abs/2502.13713" title="Abstract" id="2502.13713"> arXiv:2502.13713 </a> (cross-list from cs.IR) [<a href="/pdf/2502.13713" title="Download PDF" id="pdf-2502.13713" aria-labelledby="pdf-2502.13713">pdf</a>, <a href="https://arxiv.org/html/2502.13713v2" title="View HTML" id="html-2502.13713" aria-labelledby="html-2502.13713" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.13713" title="Other formats" id="oth-2502.13713" aria-labelledby="oth-2502.13713">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> TALKPLAY: Multimodal Music Recommendation with Large Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Doh,+S">Seungheon Doh</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&amp;query=Choi,+K">Keunwoo Choi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Nam,+J">Juhan Nam</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Information Retrieval (cs.IR)</span>; Sound (cs.SD); Audio and Speech Processing (eess.AS) </div> </div> </dd> <dt> <a name='item18'>[18]</a> <a href ="/abs/2502.13574" title="Abstract" id="2502.13574"> arXiv:2502.13574 </a> (cross-list from eess.IV) [<a href="/pdf/2502.13574" title="Download PDF" id="pdf-2502.13574" aria-labelledby="pdf-2502.13574">pdf</a>, <a href="https://arxiv.org/html/2502.13574v1" title="View HTML" id="html-2502.13574" aria-labelledby="html-2502.13574" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.13574" title="Other formats" id="oth-2502.13574" aria-labelledby="oth-2502.13574">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> RestoreGrad: Signal Restoration Using Conditional Denoising Diffusion Models with Jointly Learned Prior </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Lee,+C">Ching-Hua Lee</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Yang,+C">Chouchang Yang</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Cho,+J">Jaejin Cho</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Saidutta,+Y+M">Yashas Malur Saidutta</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Sharma,+R">Rakshith Sharma Srinivasa</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Shen,+Y">Yilin Shen</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Jin,+H">Hongxia Jin</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Image and Video Processing (eess.IV)</span>; 
Machine Learning (cs.LG); Audio and Speech Processing (eess.AS) </div> </div> </dd> <dt> <a name='item19'>[19]</a> <a href ="/abs/2502.13440" title="Abstract" id="2502.13440"> arXiv:2502.13440 </a> (cross-list from cs.SD) [<a href="/pdf/2502.13440" title="Download PDF" id="pdf-2502.13440" aria-labelledby="pdf-2502.13440">pdf</a>, <a href="https://arxiv.org/html/2502.13440v1" title="View HTML" id="html-2502.13440" aria-labelledby="html-2502.13440" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.13440" title="Other formats" id="oth-2502.13440" aria-labelledby="oth-2502.13440">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Semi-supervised classification of bird vocalizations </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Hexeberg,+S">Simen Hexeberg</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Chitre,+M">Mandar Chitre</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Hoffmann-Kuhnt,+M">Matthias Hoffmann-Kuhnt</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Low,+B+W">Bing Wen Low</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Sound (cs.SD)</span>; Artificial Intelligence (cs.AI); Computer Vision and Pattern Recognition (cs.CV); Audio and Speech Processing (eess.AS); Quantitative Methods (q-bio.QM) </div> </div> </dd> <dt> <a name='item20'>[20]</a> <a href ="/abs/2502.13433" title="Abstract" id="2502.13433"> arXiv:2502.13433 </a> (cross-list from cs.SD) [<a href="/pdf/2502.13433" title="Download PDF" id="pdf-2502.13433" aria-labelledby="pdf-2502.13433">pdf</a>, <a href="https://arxiv.org/html/2502.13433v2" title="View HTML" id="html-2502.13433" aria-labelledby="html-2502.13433" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.13433" title="Other formats" id="oth-2502.13433" 
aria-labelledby="oth-2502.13433">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> MATS: An Audio Language Model under Text-only Supervision </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wang,+W">Wen Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Hou,+R">Ruibing Hou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Chang,+H">Hong Chang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Shan,+S">Shiguang Shan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Chen,+X">Xilin Chen</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 19 pages,11 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Sound (cs.SD)</span>; Audio and Speech Processing (eess.AS) </div> </div> </dd> <dt> <a name='item21'>[21]</a> <a href ="/abs/2502.13395" title="Abstract" id="2502.13395"> arXiv:2502.13395 </a> (cross-list from cs.SD) [<a href="/pdf/2502.13395" title="Download PDF" id="pdf-2502.13395" aria-labelledby="pdf-2502.13395">pdf</a>, <a href="/format/2502.13395" title="Other formats" id="oth-2502.13395" aria-labelledby="oth-2502.13395">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Unsupervised CP-UNet Framework for Denoising DAS Data with Decay Noise </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Huang,+T">Tianye Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Li,+A">Aopeng Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Li,+X">Xiang Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zhang,+J">Jing Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Xian,+S">Sijing Xian</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zhang,+Q">Qi Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Lu,+M">Mingkong Lu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Chen,+G">Guodong Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Xiong,+L">Liangming Xiong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Hu,+X">Xiangyun Hu</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 13 pages, 8 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Sound (cs.SD)</span>; Machine Learning (cs.LG); Audio and Speech Processing (eess.AS); Signal Processing (eess.SP); Optics (physics.optics) </div> </div> </dd> </dl> <dl id='articles'> <h3>Wed, 19 Feb 2025 (showing 3 of 3 entries )</h3> <dt> <a name='item22'>[22]</a> <a href ="/abs/2502.12489" title="Abstract" id="2502.12489"> arXiv:2502.12489 </a> [<a href="/pdf/2502.12489" title="Download PDF" id="pdf-2502.12489" aria-labelledby="pdf-2502.12489">pdf</a>, <a href="/format/2502.12489" title="Other formats" id="oth-2502.12489" aria-labelledby="oth-2502.12489">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> A Comprehensive Survey on Generative AI for Video-to-Music Generation </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Ji,+S">Shulei Ji</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Wu,+S">Songruoyao Wu</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Wang,+Z">Zihao Wang</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Li,+S">Shuyu Li</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Zhang,+K">Kejun Zhang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span 
class="primary-subject">Audio and Speech Processing (eess.AS)</span>; Artificial Intelligence (cs.AI); Multimedia (cs.MM) </div> </div> </dd> <dt> <a name='item23'>[23]</a> <a href ="/abs/2502.12623" title="Abstract" id="2502.12623"> arXiv:2502.12623 </a> (cross-list from cs.SD) [<a href="/pdf/2502.12623" title="Download PDF" id="pdf-2502.12623" aria-labelledby="pdf-2502.12623">pdf</a>, <a href="https://arxiv.org/html/2502.12623v1" title="View HTML" id="html-2502.12623" aria-labelledby="html-2502.12623" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12623" title="Other formats" id="oth-2502.12623" aria-labelledby="oth-2502.12623">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> DeepResonance: Enhancing Multimodal Music Understanding via Music-centric Multi-way Instruction Tuning </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Mao,+Z">Zhuoyuan Mao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zhao,+M">Mengjie Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wu,+Q">Qiyu Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wakaki,+H">Hiromi Wakaki</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Mitsufuji,+Y">Yuki Mitsufuji</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Sound (cs.SD)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL); Multimedia (cs.MM); Audio and Speech Processing (eess.AS) </div> </div> </dd> <dt> <a name='item24'>[24]</a> <a href ="/abs/2502.12438" title="Abstract" id="2502.12438"> arXiv:2502.12438 </a> (cross-list from cs.SD) [<a href="/pdf/2502.12438" title="Download PDF" id="pdf-2502.12438" aria-labelledby="pdf-2502.12438">pdf</a>, <a href="https://arxiv.org/html/2502.12438v1" title="View HTML" id="html-2502.12438" 
aria-labelledby="html-2502.12438" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12438" title="Other formats" id="oth-2502.12438" aria-labelledby="oth-2502.12438">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Note-Level Singing Melody Transcription for Time-Aligned Musical Score Generation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Kim,+L">Leekyung Kim</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Jeon,+S">Sungwook Jeon</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Heo,+W">Wan Heo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Park,+J">Jonghun Park</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted by IEEE Transactions on Audio, Speech and Language Processing(TASLP) </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Sound (cs.SD)</span>; Audio and Speech Processing (eess.AS); Signal Processing (eess.SP) </div> </div> </dd> </dl> <dl id='articles'> <h3>Tue, 18 Feb 2025 (showing 16 of 16 entries )</h3> <dt> <a name='item25'>[25]</a> <a href ="/abs/2502.11572" title="Abstract" id="2502.11572"> arXiv:2502.11572 </a> [<a href="/pdf/2502.11572" title="Download PDF" id="pdf-2502.11572" aria-labelledby="pdf-2502.11572">pdf</a>, <a href="https://arxiv.org/html/2502.11572v2" title="View HTML" id="html-2502.11572" aria-labelledby="html-2502.11572" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11572" title="Other formats" id="oth-2502.11572" aria-labelledby="oth-2502.11572">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Improving Rare-Word Recognition of Whisper in Zero-Shot Settings </div> <div class='list-authors'><a 
href="https://arxiv.org/search/eess?searchtype=author&amp;query=Jogi,+Y">Yash Jogi</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Aggarwal,+V">Vaibhav Aggarwal</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Nair,+S+S">Shabari S Nair</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Verma,+Y">Yash Verma</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Kubba,+A">Aayush Kubba</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted at IEEE SLT 2024 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Audio and Speech Processing (eess.AS)</span>; Sound (cs.SD) </div> </div> </dd> <dt> <a name='item26'>[26]</a> <a href ="/abs/2502.11462" title="Abstract" id="2502.11462"> arXiv:2502.11462 </a> [<a href="/pdf/2502.11462" title="Download PDF" id="pdf-2502.11462" aria-labelledby="pdf-2502.11462">pdf</a>, <a href="https://arxiv.org/html/2502.11462v1" title="View HTML" id="html-2502.11462" aria-labelledby="html-2502.11462" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11462" title="Other formats" id="oth-2502.11462" aria-labelledby="oth-2502.11462">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> LMFCA-Net: A Lightweight Model for Multi-Channel Speech Enhancement with Efficient Narrow-Band and Cross-Band Attention </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Zhang,+Y">Yaokai Zhang</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Pei,+H">Hanchen Pei</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Wang,+W">Wanqi Wang</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Huang,+G">Gongping Huang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 
Accepted at ICASSP 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Audio and Speech Processing (eess.AS)</span>; Machine Learning (cs.LG); Sound (cs.SD) </div> </div> </dd> <dt> <a name='item27'>[27]</a> <a href ="/abs/2502.11219" title="Abstract" id="2502.11219"> arXiv:2502.11219 </a> [<a href="/pdf/2502.11219" title="Download PDF" id="pdf-2502.11219" aria-labelledby="pdf-2502.11219">pdf</a>, <a href="https://arxiv.org/html/2502.11219v1" title="View HTML" id="html-2502.11219" aria-labelledby="html-2502.11219" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11219" title="Other formats" id="oth-2502.11219" aria-labelledby="oth-2502.11219">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> AudioSpa: Spatializing Sound Events with Text </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Feng,+L">Linfeng Feng</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Zhao,+L">Lei Zhao</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Zhu,+B">Boyu Zhu</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Zhang,+X">Xiao-Lei Zhang</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Li,+X">Xuelong Li</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Audio and Speech Processing (eess.AS)</span>; Sound (cs.SD) </div> </div> </dd> <dt> <a name='item28'>[28]</a> <a href ="/abs/2502.10950" title="Abstract" id="2502.10950"> arXiv:2502.10950 </a> [<a href="/pdf/2502.10950" title="Download PDF" id="pdf-2502.10950" aria-labelledby="pdf-2502.10950">pdf</a>, <a href="https://arxiv.org/html/2502.10950v1" title="View HTML" id="html-2502.10950" aria-labelledby="html-2502.10950" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10950" 
title="Other formats" id="oth-2502.10950" aria-labelledby="oth-2502.10950">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> SpeechT-RAG: Reliable Depression Detection in LLMs with Retrieval-Augmented Generation Using Speech Timing Information </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Zhang,+X">Xiangyu Zhang</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Liu,+H">Hexin Liu</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Zhang,+Q">Qiquan Zhang</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Ahmed,+B">Beena Ahmed</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Epps,+J">Julien Epps</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Audio and Speech Processing (eess.AS)</span> </div> </div> </dd> <dt> <a name='item29'>[29]</a> <a href ="/abs/2502.10838" title="Abstract" id="2502.10838"> arXiv:2502.10838 </a> [<a href="/pdf/2502.10838" title="Download PDF" id="pdf-2502.10838" aria-labelledby="pdf-2502.10838">pdf</a>, <a href="https://arxiv.org/html/2502.10838v1" title="View HTML" id="html-2502.10838" aria-labelledby="html-2502.10838" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10838" title="Other formats" id="oth-2502.10838" aria-labelledby="oth-2502.10838">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Generalizable speech deepfake detection via meta-learned LoRA </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Laakkonen,+J">Janne Laakkonen</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Kukanov,+I">Ivan Kukanov</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Hautam%C3%A4ki,+V">Ville Hautamäki</a></div> <div 
class='list-comments mathjax'><span class='descriptor'>Comments:</span> 9 pages, 2 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Audio and Speech Processing (eess.AS)</span>; Machine Learning (cs.LG); Sound (cs.SD) </div> </div> </dd> <dt> <a name='item30'>[30]</a> <a href ="/abs/2502.10822" title="Abstract" id="2502.10822"> arXiv:2502.10822 </a> [<a href="/pdf/2502.10822" title="Download PDF" id="pdf-2502.10822" aria-labelledby="pdf-2502.10822">pdf</a>, <a href="https://arxiv.org/html/2502.10822v1" title="View HTML" id="html-2502.10822" aria-labelledby="html-2502.10822" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10822" title="Other formats" id="oth-2502.10822" aria-labelledby="oth-2502.10822">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> NeuroAMP: A Novel End-to-end General Purpose Deep Neural Amplifier for Personalized Hearing Aids </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Ahmed,+S">Shafique Ahmed</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Zezario,+R+E">Ryandhimas E. 
Zezario</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Yuan,+H">Hui-Guan Yuan</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Hussain,+A">Amir Hussain</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Wang,+H">Hsin-Min Wang</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Chung,+W">Wei-Ho Chung</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Tsao,+Y">Yu Tsao</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Audio and Speech Processing (eess.AS)</span>; Artificial Intelligence (cs.AI); Sound (cs.SD) </div> </div> </dd> <dt> <a name='item31'>[31]</a> <a href ="/abs/2502.10511" title="Abstract" id="2502.10511"> arXiv:2502.10511 </a> [<a href="/pdf/2502.10511" title="Download PDF" id="pdf-2502.10511" aria-labelledby="pdf-2502.10511">pdf</a>, <a href="https://arxiv.org/html/2502.10511v1" title="View HTML" id="html-2502.10511" aria-labelledby="html-2502.10511" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10511" title="Other formats" id="oth-2502.10511" aria-labelledby="oth-2502.10511">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Enhancing Age-Related Robustness in Children Speaker Verification </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Shetty,+V+M">Vishwas M. Shetty</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Zheng,+J">Jiusi Zheng</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Lulich,+S+M">Steven M. 
Lulich</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Alwan,+A">Abeer Alwan</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted to ICASSP 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Audio and Speech Processing (eess.AS)</span>; Sound (cs.SD) </div> </div> </dd> <dt> <a name='item32'>[32]</a> <a href ="/abs/2502.10447" title="Abstract" id="2502.10447"> arXiv:2502.10447 </a> [<a href="/pdf/2502.10447" title="Download PDF" id="pdf-2502.10447" aria-labelledby="pdf-2502.10447">pdf</a>, <a href="https://arxiv.org/html/2502.10447v1" title="View HTML" id="html-2502.10447" aria-labelledby="html-2502.10447" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10447" title="Other formats" id="oth-2502.10447" aria-labelledby="oth-2502.10447">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> MoHAVE: Mixture of Hierarchical Audio-Visual Experts for Robust Speech Recognition </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Kim,+S">Sungnyun Kim</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Jang,+K">Kangwook Jang</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Bae,+S">Sangmin Bae</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Cho,+S">Sungwoo Cho</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Yun,+S">Se-Young Yun</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Preliminary work </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Audio and Speech Processing (eess.AS)</span>; Computation and Language (cs.CL); Machine Learning (cs.LG) </div> </div> </dd> <dt> <a name='item33'>[33]</a> <a href ="/abs/2502.10426" title="Abstract" 
id="2502.10426"> arXiv:2502.10426 </a> [<a href="/pdf/2502.10426" title="Download PDF" id="pdf-2502.10426" aria-labelledby="pdf-2502.10426">pdf</a>, <a href="/format/2502.10426" title="Other formats" id="oth-2502.10426" aria-labelledby="oth-2502.10426">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Musical Score Following using Statistical Inference </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Cowley,+J">Josephine Cowley</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Audio and Speech Processing (eess.AS)</span>; Sound (cs.SD); Signal Processing (eess.SP) </div> </div> </dd> <dt> <a name='item34'>[34]</a> <a href ="/abs/2502.12002" title="Abstract" id="2502.12002"> arXiv:2502.12002 </a> (cross-list from cs.SD) [<a href="/pdf/2502.12002" title="Download PDF" id="pdf-2502.12002" aria-labelledby="pdf-2502.12002">pdf</a>, <a href="https://arxiv.org/html/2502.12002v1" title="View HTML" id="html-2502.12002" aria-labelledby="html-2502.12002" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.12002" title="Other formats" id="oth-2502.12002" aria-labelledby="oth-2502.12002">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> NaturalL2S: End-to-End High-quality Multispeaker Lip-to-Speech Synthesis with Differential Digital Signal Processing </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Liang,+Y">Yifan Liang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Liu,+F">Fangkun Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Li,+A">Andong Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Li,+X">Xiaodong Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zheng,+C">Chengshi 
Zheng</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Sound (cs.SD)</span>; Computer Vision and Pattern Recognition (cs.CV); Audio and Speech Processing (eess.AS) </div> </div> </dd> <dt> <a name='item35'>[35]</a> <a href ="/abs/2502.11946" title="Abstract" id="2502.11946"> arXiv:2502.11946 </a> (cross-list from cs.CL) [<a href="/pdf/2502.11946" title="Download PDF" id="pdf-2502.11946" aria-labelledby="pdf-2502.11946">pdf</a>, <a href="https://arxiv.org/html/2502.11946v2" title="View HTML" id="html-2502.11946" aria-labelledby="html-2502.11946" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11946" title="Other formats" id="oth-2502.11946" aria-labelledby="oth-2502.11946">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Step-Audio: Unified Understanding and Generation in Intelligent Speech Interaction </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Huang,+A">Ailin Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wu,+B">Boyong Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wang,+B">Bruce Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Yan,+C">Chao Yan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Hu,+C">Chen Hu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Feng,+C">Chengli Feng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Tian,+F">Fei Tian</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Shen,+F">Feiyu Shen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Li,+J">Jingbei Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Chen,+M">Mingrui Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Liu,+P">Peng Liu</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&amp;query=Miao,+R">Ruihang Miao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=You,+W">Wang You</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Chen,+X">Xi Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Yang,+X">Xuerui Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Huang,+Y">Yechang Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zhang,+Y">Yuxiang Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Gong,+Z">Zheng Gong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zhang,+Z">Zixin Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zhou,+H">Hongyu Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Sun,+J">Jianjian Sun</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Li,+B">Brian Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Feng,+C">Chengting Feng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wan,+C">Changyi Wan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Hu,+H">Hanpeng Hu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wu,+J">Jianchang Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zhen,+J">Jiangjie Zhen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Ming,+R">Ranchen Ming</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Yuan,+S">Song Yuan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zhang,+X">Xuelin Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zhou,+Y">Yu Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Li,+B">Bingxin Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Ma,+B">Buyun Ma</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wang,+H">Hongyuan Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=An,+K">Kang An</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Ji,+W">Wei Ji</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Li,+W">Wen Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wen,+X">Xuan Wen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Kong,+X">Xiangwen Kong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Ma,+Y">Yuankai Ma</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Liang,+Y">Yuanwei Liang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Mou,+Y">Yun Mou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Ahmidi,+B">Bahtiyar Ahmidi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wang,+B">Bin Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Li,+B">Bo Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Miao,+C">Changxin Miao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Xu,+C">Chen Xu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wang,+C">Chenrun Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Shi,+D">Dapeng Shi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Sun,+D">Deshan Sun</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Hu,+D">Dingyuan Hu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Sai,+D">Dula Sai</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Liu,+E">Enle Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Huang,+G">Guanzhe Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Yan,+G">Gulin Yan</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wang,+H">Heng Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Jia,+H">Haonan Jia</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zhang,+H">Haoyang Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Gong,+J">Jiahao Gong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Guo,+J">Junjing Guo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Liu,+J">Jiashuai Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Liu,+J">Jiahong Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Feng,+J">Jie Feng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wu,+J">Jie Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wu,+J">Jiaoren Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Yang,+J">Jie Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wang,+J">Jinguo Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zhang,+J">Jingyang Zhang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Lin,+J">Junzhe Lin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Li,+K">Kaixiang Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Xia,+L">Lei Xia</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zhou,+L">Li Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zhao,+L">Liang Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Gu,+L">Longlong Gu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Chen,+M">Mei Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wu,+M">Menglin Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Li,+M">Ming Li</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&amp;query=Li,+M">Mingxiao Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Li,+M">Mingliang Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Liang,+M">Mingyao Liang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wang,+N">Na Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Hao,+N">Nie Hao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wu,+Q">Qiling Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Tan,+Q">Qinyuan Tan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Sun,+R">Ran Sun</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Shuai,+S">Shuai Shuai</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Pang,+S">Shaoliang Pang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Yang,+S">Shiliang Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Gao,+S">Shuli Gao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Yuan,+S">Shanshan Yuan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Liu,+S">Siqi Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Deng,+S">Shihong Deng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Jiang,+S">Shilei Jiang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Liu,+S">Sitong Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Cao,+T">Tiancheng Cao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wang,+T">Tianyu Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Deng,+W">Wenjin Deng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Xie,+W">Wuxun Xie</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Ming,+W">Weipeng Ming</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&amp;query=He,+W">Wenqing He</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Human-Computer Interaction (cs.HC); Sound (cs.SD); Audio and Speech Processing (eess.AS) </div> </div> </dd> <dt> <a name='item36'>[36]</a> <a href ="/abs/2502.11478" title="Abstract" id="2502.11478"> arXiv:2502.11478 </a> (cross-list from cs.SD) [<a href="/pdf/2502.11478" title="Download PDF" id="pdf-2502.11478" aria-labelledby="pdf-2502.11478">pdf</a>, <a href="https://arxiv.org/html/2502.11478v1" title="View HTML" id="html-2502.11478" aria-labelledby="html-2502.11478" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.11478" title="Other formats" id="oth-2502.11478" aria-labelledby="oth-2502.11478">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> TAPS: Throat and Acoustic Paired Speech Dataset for Deep Learning-Based Speech Enhancement </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Kim,+Y">Yunsik Kim</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Song,+Y">Yonghun Song</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Chung,+Y">Yoonyoung Chung</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Sound (cs.SD)</span>; Machine Learning (cs.LG); Audio and Speech Processing (eess.AS) </div> </div> </dd> <dt> <a name='item37'>[37]</a> <a href ="/abs/2502.11128" title="Abstract" id="2502.11128"> arXiv:2502.11128 </a> (cross-list from cs.CL) [<a href="/pdf/2502.11128" title="Download PDF" id="pdf-2502.11128" aria-labelledby="pdf-2502.11128">pdf</a>, <a href="https://arxiv.org/html/2502.11128v1" title="View HTML" id="html-2502.11128" aria-labelledby="html-2502.11128" rel="noopener 
noreferrer" target="_blank">html</a>, <a href="/format/2502.11128" title="Other formats" id="oth-2502.11128" aria-labelledby="oth-2502.11128">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> FELLE: Autoregressive Speech Synthesis with Token-Wise Coarse-to-Fine Flow Matching </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wang,+H">Hui Wang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Liu,+S">Shujie Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Meng,+L">Lingwei Meng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Li,+J">Jinyu Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Yang,+Y">Yifan Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zhao,+S">Shiwan Zhao</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Sun,+H">Haiyang Sun</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Liu,+Y">Yanqing Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Sun,+H">Haoqin Sun</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Zhou,+J">Jiaming Zhou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Lu,+Y">Yan Lu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Qin,+Y">Yong Qin</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Sound (cs.SD); Audio and Speech Processing (eess.AS) </div> </div> </dd> <dt> <a name='item38'>[38]</a> <a href ="/abs/2502.10718" title="Abstract" id="2502.10718"> arXiv:2502.10718 </a> (cross-list from cs.SD) [<a href="/pdf/2502.10718" title="Download PDF" id="pdf-2502.10718" aria-labelledby="pdf-2502.10718">pdf</a>, <a href="https://arxiv.org/html/2502.10718v1" title="View HTML" id="html-2502.10718" 
aria-labelledby="html-2502.10718" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10718" title="Other formats" id="oth-2502.10718" aria-labelledby="oth-2502.10718">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Hyperdimensional Intelligent Sensing for Efficient Real-Time Audio Processing on Extreme Edge </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Yun,+S">Sanggeon Yun</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Masukawa,+R">Ryozo Masukawa</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Chen,+H">Hanning Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Jeong,+S">SungHeon Jeong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Huang,+W">Wenjun Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Rezvani,+A">Arghavan Rezvani</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Na,+M">Minhyoung Na</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Yamaguchi,+Y">Yoshiki Yamaguchi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Imani,+M">Mohsen Imani</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Accepted to IEEE Access </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Sound (cs.SD)</span>; Artificial Intelligence (cs.AI); Audio and Speech Processing (eess.AS) </div> </div> </dd> <dt> <a name='item39'>[39]</a> <a href ="/abs/2502.10491" title="Abstract" id="2502.10491"> arXiv:2502.10491 </a> (cross-list from cs.SD) [<a href="/pdf/2502.10491" title="Download PDF" id="pdf-2502.10491" aria-labelledby="pdf-2502.10491">pdf</a>, <a href="https://arxiv.org/html/2502.10491v1" title="View HTML" id="html-2502.10491" aria-labelledby="html-2502.10491" rel="noopener noreferrer" 
target="_blank">html</a>, <a href="/format/2502.10491" title="Other formats" id="oth-2502.10491" aria-labelledby="oth-2502.10491">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> F-StrIPE: Fast Structure-Informed Positional Encoding for Symbolic Music Generation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Agarwal,+M">Manvi Agarwal</a> (IP Paris, LTCI, IDS), <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wang,+C">Changhong Wang</a> (LTCI), <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Richard,+G">Gael Richard</a> (S2A, IDS)</div> <div class='list-journal-ref'><span class='descriptor'>Journal-ref:</span> IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), Apr 2025, Hyderabad, India </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Sound (cs.SD)</span>; Artificial Intelligence (cs.AI); Machine Learning (cs.LG); Audio and Speech Processing (eess.AS) </div> </div> </dd> <dt> <a name='item40'>[40]</a> <a href ="/abs/2502.10467" title="Abstract" id="2502.10467"> arXiv:2502.10467 </a> (cross-list from cs.SD) [<a href="/pdf/2502.10467" title="Download PDF" id="pdf-2502.10467" aria-labelledby="pdf-2502.10467">pdf</a>, <a href="https://arxiv.org/html/2502.10467v1" title="View HTML" id="html-2502.10467" aria-labelledby="html-2502.10467" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10467" title="Other formats" id="oth-2502.10467" aria-labelledby="oth-2502.10467">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> YNote: A Novel Music Notation for Fine-Tuning LLMs in Music Generation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Lu,+S">Shao-Chien Lu</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&amp;query=Yeh,+C">Chen-Chen Yeh</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Cho,+H">Hui-Lin Cho</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Hsu,+C">Chun-Chieh Hsu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Hsu,+T">Tsai-Ling Hsu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wu,+C">Cheng-Han Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Shih,+T+K">Timothy K. Shih</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Lin,+Y">Yu-Cheng Lin</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Sound (cs.SD)</span>; Artificial Intelligence (cs.AI); Audio and Speech Processing (eess.AS) </div> </div> </dd> </dl> <dl id='articles'> <h3>Mon, 17 Feb 2025 (showing 10 of 10 entries )</h3> <dt> <a name='item41'>[41]</a> <a href ="/abs/2502.09859" title="Abstract" id="2502.09859"> arXiv:2502.09859 </a> [<a href="/pdf/2502.09859" title="Download PDF" id="pdf-2502.09859" aria-labelledby="pdf-2502.09859">pdf</a>, <a href="https://arxiv.org/html/2502.09859v1" title="View HTML" id="html-2502.09859" aria-labelledby="html-2502.09859" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.09859" title="Other formats" id="oth-2502.09859" aria-labelledby="oth-2502.09859">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Microphone Array Geometry Independent Multi-Talker Distant ASR: NTT System for the DASR Task of the CHiME-8 Challenge </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Kamo,+N">Naoyuki Kamo</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Tawara,+N">Naohiro Tawara</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Ando,+A">Atsushi Ando</a>, <a 
href="https://arxiv.org/search/eess?searchtype=author&amp;query=Kano,+T">Takatomo Kano</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Sato,+H">Hiroshi Sato</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Ikeshita,+R">Rintaro Ikeshita</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Moriya,+T">Takafumi Moriya</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Horiguch,+S">Shota Horiguch</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Matsuura,+K">Kohei Matsuura</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Ogawa,+A">Atsunori Ogawa</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Plaquet,+A">Alexis Plaquet</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Ashihara,+T">Takanori Ashihara</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Ochiai,+T">Tsubasa Ochiai</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Mimura,+M">Masato Mimura</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Delcroix,+M">Marc Delcroix</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Nakatani,+T">Tomohiro Nakatani</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Asami,+T">Taichi Asami</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Araki,+S">Shoko Araki</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 55 pages, 12 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Audio and Speech Processing (eess.AS)</span>; Signal Processing (eess.SP) </div> </div> </dd> <dt> <a name='item42'>[42]</a> <a href ="/abs/2502.10373" title="Abstract" id="2502.10373"> arXiv:2502.10373 </a> (cross-list from cs.CL) [<a href="/pdf/2502.10373" title="Download PDF" id="pdf-2502.10373" 
aria-labelledby="pdf-2502.10373">pdf</a>, <a href="https://arxiv.org/html/2502.10373v1" title="View HTML" id="html-2502.10373" aria-labelledby="html-2502.10373" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10373" title="Other formats" id="oth-2502.10373" aria-labelledby="oth-2502.10373">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> OWLS: Scaling Laws for Multilingual Speech Recognition and Translation Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Chen,+W">William Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Tian,+J">Jinchuan Tian</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Peng,+Y">Yifan Peng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Yan,+B">Brian Yan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Yang,+C+H">Chao-Han Huck Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Watanabe,+S">Shinji Watanabe</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 23 pages, 13 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Machine Learning (cs.LG); Audio and Speech Processing (eess.AS) </div> </div> </dd> <dt> <a name='item43'>[43]</a> <a href ="/abs/2502.10362" title="Abstract" id="2502.10362"> arXiv:2502.10362 </a> (cross-list from cs.SD) [<a href="/pdf/2502.10362" title="Download PDF" id="pdf-2502.10362" aria-labelledby="pdf-2502.10362">pdf</a>, <a href="https://arxiv.org/html/2502.10362v2" title="View HTML" id="html-2502.10362" aria-labelledby="html-2502.10362" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10362" title="Other formats" id="oth-2502.10362" 
aria-labelledby="oth-2502.10362">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> CLaMP 3: Universal Music Information Retrieval Across Unaligned Modalities and Unseen Languages </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Wu,+S">Shangda Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Guo,+Z">Zhancheng Guo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Yuan,+R">Ruibin Yuan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Jiang,+J">Junyan Jiang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Doh,+S">Seungheon Doh</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Xia,+G">Gus Xia</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Nam,+J">Juhan Nam</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Li,+X">Xiaobing Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Yu,+F">Feng Yu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Sun,+M">Maosong Sun</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 20 pages, 8 figures, 12 tables </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Sound (cs.SD)</span>; Audio and Speech Processing (eess.AS) </div> </div> </dd> <dt> <a name='item44'>[44]</a> <a href ="/abs/2502.10329" title="Abstract" id="2502.10329"> arXiv:2502.10329 </a> (cross-list from cs.SD) [<a href="/pdf/2502.10329" title="Download PDF" id="pdf-2502.10329" aria-labelledby="pdf-2502.10329">pdf</a>, <a href="https://arxiv.org/html/2502.10329v1" title="View HTML" id="html-2502.10329" aria-labelledby="html-2502.10329" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10329" title="Other formats" id="oth-2502.10329" 
aria-labelledby="oth-2502.10329">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> VocalCrypt: Novel Active Defense Against Deepfake Voice Based on Masking Effect </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Fei,+Q">Qingyuan Fei</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Hou,+W">Wenjie Hou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Hai,+X">Xuan Hai</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Liu,+X">Xin Liu</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 9 pages, four figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Sound (cs.SD)</span>; Cryptography and Security (cs.CR); Multimedia (cs.MM); Audio and Speech Processing (eess.AS) </div> </div> </dd> <dt> <a name='item45'>[45]</a> <a href ="/abs/2502.10154" title="Abstract" id="2502.10154"> arXiv:2502.10154 </a> (cross-list from cs.SD) [<a href="/pdf/2502.10154" title="Download PDF" id="pdf-2502.10154" aria-labelledby="pdf-2502.10154">pdf</a>, <a href="https://arxiv.org/html/2502.10154v1" title="View HTML" id="html-2502.10154" aria-labelledby="html-2502.10154" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10154" title="Other formats" id="oth-2502.10154" aria-labelledby="oth-2502.10154">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Video Soundtrack Generation by Aligning Emotions and Temporal Boundaries </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Sulun,+S">Serkan Sulun</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Viana,+P">Paula Viana</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Davies,+M+E+P">Matthew E. P. 
Davies</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Submitted to International Joint Conference on Artificial Intelligence (IJCAI) 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Sound (cs.SD)</span>; Artificial Intelligence (cs.AI); Machine Learning (cs.LG); Multimedia (cs.MM); Audio and Speech Processing (eess.AS); Image and Video Processing (eess.IV) </div> </div> </dd> <dt> <a name='item46'>[46]</a> <a href ="/abs/2502.10058" title="Abstract" id="2502.10058"> arXiv:2502.10058 </a> (cross-list from cs.CL) [<a href="/pdf/2502.10058" title="Download PDF" id="pdf-2502.10058" aria-labelledby="pdf-2502.10058">pdf</a>, <a href="https://arxiv.org/html/2502.10058v1" title="View HTML" id="html-2502.10058" aria-labelledby="html-2502.10058" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10058" title="Other formats" id="oth-2502.10058" aria-labelledby="oth-2502.10058">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> MTLM: an Innovative Language Model Training Paradigm for ASR </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Meng,+Q">Qingliang Meng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Ren,+P">Pengju Ren</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Li,+T">Tian Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Dai,+C">Changsong Dai</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Audio and Speech Processing (eess.AS) </div> </div> </dd> <dt> <a name='item47'>[47]</a> <a href ="/abs/2502.10011" title="Abstract" id="2502.10011"> arXiv:2502.10011 </a> (cross-list from cs.SD) [<a href="/pdf/2502.10011" title="Download PDF" id="pdf-2502.10011" 
aria-labelledby="pdf-2502.10011">pdf</a>, <a href="https://arxiv.org/html/2502.10011v1" title="View HTML" id="html-2502.10011" aria-labelledby="html-2502.10011" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10011" title="Other formats" id="oth-2502.10011" aria-labelledby="oth-2502.10011">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> InterGridNet: An Electric Network Frequency Approach for Audio Source Location Classification Using Convolutional Neural Networks </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Korgialas,+C">Christos Korgialas</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Tsingalis,+I">Ioannis Tsingalis</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Tzolopoulos,+G">Georgios Tzolopoulos</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Kotropoulos,+C">Constantine Kotropoulos</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> The 10th International Conference on Advances in Signal, Image and Video Processing (SIGNAL 2025) </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Sound (cs.SD)</span>; Machine Learning (cs.LG); Audio and Speech Processing (eess.AS) </div> </div> </dd> <dt> <a name='item48'>[48]</a> <a href ="/abs/2502.09940" title="Abstract" id="2502.09940"> arXiv:2502.09940 </a> (cross-list from cs.CL) [<a href="/pdf/2502.09940" title="Download PDF" id="pdf-2502.09940" aria-labelledby="pdf-2502.09940">pdf</a>, <a href="https://arxiv.org/html/2502.09940v1" title="View HTML" id="html-2502.09940" aria-labelledby="html-2502.09940" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.09940" title="Other formats" id="oth-2502.09940" aria-labelledby="oth-2502.09940">other</a>] </dt> <dd> <div class='meta'> <div class='list-title 
mathjax'><span class='descriptor'>Title:</span> A Preliminary Exploration with GPT-4o Voice Mode </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Lin,+Y">Yu-Xiang Lin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Yang,+C">Chih-Kai Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Chen,+W">Wei-Chih Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Li,+C">Chen-An Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Huang,+C">Chien-yu Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Chen,+X">Xuanjun Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Lee,+H">Hung-yi Lee</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Work in progress </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Sound (cs.SD); Audio and Speech Processing (eess.AS) </div> </div> </dd> <dt> <a name='item49'>[49]</a> <a href ="/abs/2502.09782" title="Abstract" id="2502.09782"> arXiv:2502.09782 </a> (cross-list from cs.LG) [<a href="/pdf/2502.09782" title="Download PDF" id="pdf-2502.09782" aria-labelledby="pdf-2502.09782">pdf</a>, <a href="/format/2502.09782" title="Other formats" id="oth-2502.09782" aria-labelledby="oth-2502.09782">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Improving Acoustic Side-Channel Attacks on Keyboards Using Transformers and Large Language Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Park,+J+H">Jin Hyun Park</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Ayati,+S+A">Seyyed Ali Ayati</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Cai,+Y">Yichen Cai</a></div> <div class='list-comments 
mathjax'><span class='descriptor'>Comments:</span> We would like to withdraw our paper due to a significant error in the experimental methodology, which impacts the validity of our results. The error specifically affects the analysis presented in Section 4, where an incorrect dataset preprocessing step led to misleading conclusions </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL); Audio and Speech Processing (eess.AS) </div> </div> </dd> <dt> <a name='item50'>[50]</a> <a href ="/abs/2502.09661" title="Abstract" id="2502.09661"> arXiv:2502.09661 </a> (cross-list from cs.SD) [<a href="/pdf/2502.09661" title="Download PDF" id="pdf-2502.09661" aria-labelledby="pdf-2502.09661">pdf</a>, <a href="https://arxiv.org/html/2502.09661v1" title="View HTML" id="html-2502.09661" aria-labelledby="html-2502.09661" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.09661" title="Other formats" id="oth-2502.09661" aria-labelledby="oth-2502.09661">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> SIToBI -- A Speech Prosody Annotation Tool for Indian Languages </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Thinakaran,+P">Preethi Thinakaran</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Muthuramalingam,+M">Malarvizhi Muthuramalingam</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=S,+S">Sooriya S</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Gladston,+A+R">Anushiya Rachel Gladston</a>, <a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Vijayalakshmi,+P">P. 
Vijayalakshmi</a>,
<a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Murthy,+H+A">Hema A Murthy</a>,
<a href="https://arxiv.org/search/cs?searchtype=author&amp;query=Nagarajan,+T">T. Nagarajan</a></div>
<div class="list-subjects"><span class="descriptor">Subjects:</span> <span class="primary-subject">Sound (cs.SD)</span>; Audio and Speech Processing (eess.AS) </div>
</div>
</dd>
</dl>

<!-- Listing summary and page-size controls. Only "fewer" is a live link;
     "more" and "all" are rendered as plain grey text. The href is quoted
     because an unquoted value containing "=" is a parse error. -->
<div class="paging">Total of 50 entries </div>
<div class="morefewer">Showing up to 50 entries per page:
  <a href="/list/eess.AS/recent?skip=0&amp;show=25" rel="nofollow"> fewer</a> |
  <span style="color: #454545">more</span> |
  <span style="color: #454545">all</span>
</div>

<!-- Closes three wrapper <div>s and the <main> element that were opened earlier in the document. -->
</div>
</div>
</div>
</main>

<!-- Site footer: secondary navigation laid out as two macro-columns, each holding two link lists.
     Icons are decorative (the adjacent link text names each link), so they are hidden from
     assistive technology with aria-hidden. Links that open a new tab carry rel="noopener noreferrer",
     matching the "View HTML" links in the listing rows. -->
<footer style="clear: both;">
  <div class="columns is-desktop" role="navigation" aria-label="Secondary" style="margin: -0.75em -0.75em 0.75em -0.75em">

    <!-- Macro-Column 1 -->
    <div class="column" style="padding: 0;">
      <div class="columns">
        <div class="column">
          <ul style="list-style: none; line-height: 2;">
            <li><a href="https://info.arxiv.org/about">About</a></li>
            <li><a href="https://info.arxiv.org/help">Help</a></li>
          </ul>
        </div>
        <div class="column">
          <ul style="list-style: none; line-height: 2;">
            <li>
              <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation" aria-hidden="true"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>
              <a href="https://info.arxiv.org/help/contact.html"> Contact</a>
            </li>
            <li>
              <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation" aria-hidden="true"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg>
              <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a>
            </li>
          </ul>
        </div>
      </div>
    </div>
    <!-- End Macro-Column 1 -->

    <!-- Macro-Column 2 -->
    <div class="column" style="padding: 0;">
      <div class="columns">
        <div class="column">
          <ul style="list-style: none; line-height: 2;">
            <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li>
            <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li>
          </ul>
        </div>
        <div class="column sorry-app-links">
          <ul style="list-style: none; line-height: 2;">
            <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li>
            <li>
              <!-- Status links open in a new tab. No whitespace is placed between an icon and the
                   link text that follows it, so the rendered spacing is unchanged. -->
              <p class="help">
                <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank" rel="noopener noreferrer">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation" aria-hidden="true"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br>
                Get status notifications via
                <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank" rel="noopener noreferrer"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation" aria-hidden="true"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a>
                or
                <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank" rel="noopener noreferrer"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation" aria-hidden="true"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a>
              </p>
            </li>
          </ul>
        </div>
      </div>
    </div>
    <!-- End Macro-Column 2 -->

  </div>
</footer>

<!-- Closes a wrapper <div> opened earlier in the document; the acknowledgement script loads last. -->
</div>
<script src="/static/base/1.0.1/js/member_acknowledgement.js"></script>
</body>
</html>

Pages: 1 2 3 4 5 6 7 8 9 10