Audio and Speech Processing

aria-label="Search term or terms" /> <input type="hidden" name="source" value="header"> <input type="hidden" name="searchtype" value="all"> <button class="button">GO</button> </div> </form> </div> <button class="toggle-control"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-white" role="menu"><title>open navigation menu</title><path d="M16 132h416c8.837 0 16-7.163 16-16V76c0-8.837-7.163-16-16-16H16C7.163 60 0 67.163 0 76v40c0 8.837 7.163 16 16 16zm0 160h416c8.837 0 16-7.163 16-16v-40c0-8.837-7.163-16-16-16H16c-8.837 0-16 7.163-16 16v40c0 8.837 7.163 16 16 16zm0 160h416c8.837 0 16-7.163 16-16v-40c0-8.837-7.163-16-16-16H16c-8.837 0-16 7.163-16 16v40c0 8.837 7.163 16 16 16z"/ ></svg></button> <div class="mobile-toggle-block toggle-target"> <nav class="mobile-menu" aria-labelledby="mobilemenulabel"> <h2 id="mobilemenulabel">quick links</h2> <ul> <li><a href="https://arxiv.org/login">Login</a></li> <li><a href="https://info.arxiv.org/help">Help Pages</a></li> <li><a href="https://info.arxiv.org/about">About</a></li> </ul> </nav> </div> </div> </div> </div><!-- /end mobile-header --> </header> <main> <div id="content"> <div id='content-inner'> <div id='dlpage'> <h1>Audio and Speech Processing</h1> <ul> <li><a href="#item0">New submissions</a></li> <li><a href="#item3">Cross-lists</a></li> <li><a href="#item7">Replacements</a></li> </ul> <p>See <a id="recent-eess.AS" aria-labelledby="recent-eess.AS" href="/list/eess.AS/recent">recent</a> articles</p> <h3>Showing new listings for Thursday, 20 March 2025</h3> <div class='paging'>Total of 11 entries </div> <div class='morefewer'>Showing up to 2000 entries per page: <a href=/list/eess.AS/new?skip=0&amp;show=1000 rel="nofollow"> fewer</a> | <span style="color: #454545">more</span> | <span style="color: #454545">all</span> </div> <dl id='articles'> <h3>New submissions (showing 2 of 2 entries)</h3> <dt> <a name='item1'>[1]</a> <a href ="/abs/2503.14854" title="Abstract" id="2503.14854"> arXiv:2503.14854 </a> [<a href="/pdf/2503.14854" title="Download PDF" id="pdf-2503.14854" aria-labelledby="pdf-2503.14854">pdf</a>, <a href="https://arxiv.org/html/2503.14854v1" title="View HTML" id="html-2503.14854" aria-labelledby="html-2503.14854" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.14854" title="Other formats" id="oth-2503.14854" aria-labelledby="oth-2503.14854">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Analysis and Extension of Noisy-target Training for Unsupervised Target Signal Enhancement </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Fujimura,+T">Takuya Fujimura</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Toda,+T">Tomoki Toda</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Audio and Speech Processing (eess.AS)</span> </div> <p class='mathjax'> Deep neural network-based target signal enhancement (TSE) is usually trained in a supervised manner using clean target signals. However, collecting clean target signals is costly and such signals are not always available. Thus, it is desirable to develop an unsupervised method that does not rely on clean target signals. Among various studies on unsupervised TSE methods, Noisy-target Training (NyTT) has been established as a fundamental method. 
NyTT simply replaces clean target signals with noisy ones in the typical supervised training, and it has been experimentally shown to achieve TSE. Despite its effectiveness and simplicity, its mechanism and detailed behavior are still unclear. In this paper, to advance NyTT and, thus, unsupervised methods as a whole, we analyze NyTT from various perspectives. We experimentally demonstrate the mechanism of NyTT, the desirable conditions, and the effectiveness of utilizing noisy signals in situations where a small number of clean target signals are available. Furthermore, we propose an improved version of NyTT based on its properties and explore its capabilities in the dereverberation and declipping tasks, beyond the denoising task. </p> </div> </dd> <dt> <a name='item2'>[2]</a> <a href ="/abs/2503.15338" title="Abstract" id="2503.15338"> arXiv:2503.15338 </a> [<a href="/pdf/2503.15338" title="Download PDF" id="pdf-2503.15338" aria-labelledby="pdf-2503.15338">pdf</a>, <a href="https://arxiv.org/html/2503.15338v1" title="View HTML" id="html-2503.15338" aria-labelledby="html-2503.15338" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2503.15338" title="Other formats" id="oth-2503.15338" aria-labelledby="oth-2503.15338">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Solla: Towards a Speech-Oriented LLM That Hears Acoustic Context </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Ao,+J">Junyi Ao</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Chen,+D">Dekun Chen</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Tian,+X">Xiaohai Tian</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Feng,+W">Wenjie Feng</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Zhang,+J">Jun Zhang</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Lu,+L">Lu Lu</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Wang,+Y">Yuxuan Wang</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Li,+H">Haizhou Li</a>, <a href="https://arxiv.org/search/eess?searchtype=author&amp;query=Wu,+Z">Zhizheng Wu</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Audio and Speech Processing (eess.AS)</span>; Computation and Language (cs.CL); Sound (cs.SD) </div> <p class='mathjax'> Large Language Models (LLMs) have recently shown remarkable ability to process not only text but also multimodal inputs such as speech and audio. However, most existing models primarily focus on analyzing input signals using text instructions, overlooking scenarios in which speech instructions and audio are mixed and serve as inputs to the model. To address these challenges, we introduce Solla, a novel framework designed to understand speech-based questions and hear the acoustic context concurrently. Solla incorporates an audio tagging module to effectively identify and represent audio events, as well as an ASR-assisted prediction method to improve comprehension of spoken content. To rigorously evaluate Solla and other publicly available models, we propose a new benchmark dataset called SA-Eval, which includes three tasks: audio event classification, audio captioning, and audio question answering. 
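The core of NyTT, training an ordinary supervised enhancement model with noisy recordings standing in for clean targets, can be illustrated with a short sketch. This is a minimal illustration under stated assumptions, not the authors' code: the placeholder model, the masking front end, and the way additional noise is mixed into the input are all assumptions made for the example.

```python
# Minimal sketch of Noisy-target Training (NyTT): instead of (noisy input, clean target)
# pairs, the "target" is itself a noisy recording, and the model input is that recording
# with extra noise mixed in. Model, features, and mixing are illustrative assumptions.
import torch
import torch.nn as nn

class TinyEnhancer(nn.Module):
    """Placeholder mask-based enhancement network (stands in for any TSE model)."""
    def __init__(self, n_bins=257):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(n_bins, 256), nn.ReLU(),
                                 nn.Linear(256, n_bins), nn.Sigmoid())

    def forward(self, noisy_spec):
        return self.net(noisy_spec) * noisy_spec  # predicted mask applied to the input

def nytt_step(model, noisy_target_spec, extra_noise_spec, optimizer):
    """One NyTT update: the noisy recording plays the role of the clean target."""
    model_input = noisy_target_spec + extra_noise_spec   # further-corrupted input
    estimate = model(model_input)
    loss = nn.functional.mse_loss(estimate, noisy_target_spec)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()

model = TinyEnhancer()
opt = torch.optim.Adam(model.parameters(), lr=1e-3)
noisy_target = torch.rand(4, 100, 257)       # dummy magnitude spectrogram frames
extra_noise = 0.3 * torch.rand(4, 100, 257)  # independent noise added to the input
print(nytt_step(model, noisy_target, extra_noise, opt))
```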
[2] arXiv:2503.15338 [pdf, html, other]
Title: Solla: Towards a Speech-Oriented LLM That Hears Acoustic Context
Authors: Junyi Ao, Dekun Chen, Xiaohai Tian, Wenjie Feng, Jun Zhang, Lu Lu, Yuxuan Wang, Haizhou Li, Zhizheng Wu
Subjects: Audio and Speech Processing (eess.AS); Computation and Language (cs.CL); Sound (cs.SD)

Large Language Models (LLMs) have recently shown a remarkable ability to process not only text but also multimodal inputs such as speech and audio. However, most existing models primarily focus on analyzing input signals using text instructions, overlooking scenarios in which speech instructions and audio are mixed and serve together as inputs to the model. To address these challenges, we introduce Solla, a novel framework designed to understand speech-based questions and hear the acoustic context concurrently. Solla incorporates an audio tagging module to effectively identify and represent audio events, as well as an ASR-assisted prediction method to improve comprehension of spoken content. To rigorously evaluate Solla and other publicly available models, we propose a new benchmark dataset called SA-Eval, which includes three tasks: audio event classification, audio captioning, and audio question answering. SA-Eval includes diverse speech instructions with various speaking styles and two difficulty levels, easy and hard, to capture the range of real-world acoustic conditions. Experimental results show that Solla performs on par with or outperforms baseline models on both the easy and hard test sets, underscoring its effectiveness in jointly understanding speech and audio.

Cross submissions (showing 4 of 4 entries)

[3] arXiv:2503.14545 (cross-list from cs.LG) [pdf, html, other]
Title: PANDORA: Diffusion Policy Learning for Dexterous Robotic Piano Playing
Authors: Yanjia Huang, Renjie Li, Zhengzhong Tu
Subjects: Machine Learning (cs.LG); Robotics (cs.RO); Sound (cs.SD); Audio and Speech Processing (eess.AS)

We present PANDORA, a novel diffusion-based policy learning framework designed specifically for dexterous robotic piano performance. Our approach employs a conditional U-Net architecture enhanced with FiLM-based global conditioning, which iteratively denoises noisy action sequences into smooth, high-dimensional trajectories. To achieve precise key execution coupled with expressive musical performance, we design a composite reward function that integrates task-specific accuracy, audio fidelity, and high-level semantic feedback from a large language model (LLM) oracle. The LLM oracle assesses musical expressiveness and stylistic nuances, enabling dynamic, hand-specific reward adjustments. Further augmented by a residual inverse-kinematics refinement policy, PANDORA achieves state-of-the-art performance in the ROBOPIANIST environment, significantly outperforming baselines in both precision and expressiveness. Ablation studies validate the critical contributions of diffusion-based denoising and LLM-driven semantic feedback in enhancing robotic musicianship. Videos available at: https://taco-group.github.io/PANDORA
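FiLM-based global conditioning, as mentioned in the PANDORA abstract, modulates intermediate features with a learned per-channel scale and shift derived from a conditioning vector. The sketch below is a generic FiLM layer inside a 1-D convolutional block, not the paper's architecture; the layer sizes and the content of the conditioning vector are assumptions.

```python
# Generic FiLM (Feature-wise Linear Modulation) conditioning for a 1-D conv block, in the
# spirit of the conditional U-Net described in the abstract. Shapes are illustrative only.
import torch
import torch.nn as nn

class FiLMBlock(nn.Module):
    def __init__(self, channels, cond_dim):
        super().__init__()
        self.conv = nn.Conv1d(channels, channels, kernel_size=3, padding=1)
        self.norm = nn.GroupNorm(8, channels)
        # One linear layer emits the per-channel scale (gamma) and shift (beta).
        self.to_film = nn.Linear(cond_dim, 2 * channels)

    def forward(self, x, cond):
        # x: (batch, channels, time) action-sequence features; cond: (batch, cond_dim)
        h = self.norm(self.conv(x))
        gamma, beta = self.to_film(cond).chunk(2, dim=-1)
        h = gamma.unsqueeze(-1) * h + beta.unsqueeze(-1)   # feature-wise modulation
        return torch.relu(h) + x                           # residual connection

block = FiLMBlock(channels=64, cond_dim=128)
actions = torch.randn(2, 64, 32)       # noisy action features over 32 timesteps
conditioning = torch.randn(2, 128)     # e.g. goal / observation embedding (assumed)
print(block(actions, conditioning).shape)   # torch.Size([2, 64, 32])
```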
[4] arXiv:2503.14928 (cross-list from cs.CV) [pdf, html, other]
Title: Shushing! Let's Imagine an Authentic Speech from the Silent Video
Authors: Jiaxin Ye, Hongming Shan
Comments: Project Page: https://imagintalk.github.io
Subjects: Computer Vision and Pattern Recognition (cs.CV); Artificial Intelligence (cs.AI); Sound (cs.SD); Audio and Speech Processing (eess.AS)

Vision-guided speech generation aims to produce authentic speech from facial appearance or lip motions without relying on auditory signals, offering significant potential for applications such as dubbing in filmmaking and assisting individuals with aphonia. Despite recent progress, existing methods struggle to achieve unified cross-modal alignment across semantics, timbre, and emotional prosody from visual cues, prompting us to propose Consistent Video-to-Speech (CV2S) as an extended task to enhance cross-modal consistency. To tackle the emerging challenges, we introduce ImaginTalk, a novel cross-modal diffusion framework that generates faithful speech using only visual input, operating within a discrete space. Specifically, we propose a discrete lip aligner that predicts discrete speech tokens from lip videos to capture semantic information, while an error detector identifies misaligned tokens, which are subsequently refined through masked language modeling with BERT. To further enhance the expressiveness of the generated speech, we develop a style diffusion transformer equipped with a face-style adapter that adaptively customizes identity and prosody dynamics across both the channel and temporal dimensions while ensuring synchronization with lip-aware semantic features. Extensive experiments demonstrate that ImaginTalk can generate high-fidelity speech with more accurate semantic details and greater expressiveness in timbre and emotion compared to state-of-the-art baselines. Demos are shown at our project page: https://imagintalk.github.io
[5] arXiv:2503.15074 (cross-list from cs.SD) [pdf, html, other]
Title: InsectSet459: an open dataset of insect sounds for bioacoustic machine learning
Authors: Marius Faiß, Burooj Ghani, Dan Stowell
Subjects: Sound (cs.SD); Audio and Speech Processing (eess.AS)

Automatic recognition of insect sound could help us understand changing biodiversity trends around the world -- but insect sounds are challenging to recognize even for deep learning. We present a new dataset comprising 26,399 audio files from 459 species of Orthoptera and Cicadidae. It is the first large-scale dataset of insect sound that is easily applicable for developing novel deep-learning methods. Its recordings were made with a variety of audio recorders using varying sample rates to capture the extremely broad range of frequencies that insects produce. We benchmark performance with two state-of-the-art deep learning classifiers, demonstrating good performance but also significant room for improvement in acoustic insect classification. This dataset can serve as a realistic test case for implementing insect monitoring workflows, and as a challenging basis for the development of audio representation methods that can handle highly variable frequencies and/or sample rates.
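Because the recordings come at widely varying sample rates, any benchmarking pipeline has to decide whether to resample to a common rate or build a rate-robust front end. Below is a small sketch of the first option using librosa; the file path, target rate, and mel settings are placeholder choices for illustration, not part of the dataset's documentation.

```python
# Sketch: load heterogeneous-sample-rate recordings and resample to a common rate before
# feature extraction. The path and the 44.1 kHz target are illustrative assumptions.
import librosa
import numpy as np

def load_clip(path, target_sr=44100):
    # sr=None keeps each file's native sample rate instead of librosa's 22.05 kHz default.
    audio, native_sr = librosa.load(path, sr=None, mono=True)
    if native_sr != target_sr:
        audio = librosa.resample(audio, orig_sr=native_sr, target_sr=target_sr)
    return audio, target_sr

def log_mel(audio, sr, n_mels=128):
    # Log-mel spectrogram as a typical input representation for the benchmark classifiers.
    mel = librosa.feature.melspectrogram(y=audio, sr=sr, n_mels=n_mels, fmax=sr // 2)
    return librosa.power_to_db(mel, ref=np.max)

# audio, sr = load_clip("some_orthoptera_recording.wav")   # placeholder filename
# features = log_mel(audio, sr)
```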
[6] arXiv:2503.15164 (cross-list from eess.SP) [pdf, html, other]
Title: Gridless Chirp Parameter Retrieval via Constrained Two-Dimensional Atomic Norm Minimization
Authors: Dehui Yang, Feng Xi
Subjects: Signal Processing (eess.SP); Audio and Speech Processing (eess.AS)

This paper is concerned with the fundamental problem of estimating chirp parameters from a mixture of linear chirp signals. Unlike most previous methods, which solve the problem by discretizing the parameter space and then estimating the chirp parameters, we propose a gridless approach that reformulates the inverse problem as a constrained two-dimensional atomic norm minimization from structured measurements. This reformulation enables the direct estimation of continuous-valued parameters without discretization, thereby resolving the issue of basis mismatch. An approximate semidefinite programming (SDP) formulation is employed to solve the proposed convex program. Additionally, a dual polynomial is constructed to certify the optimality of the atomic decomposition. Numerical simulations demonstrate that exact recovery of chirp parameters is achievable using the proposed atomic norm minimization.
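For readers unfamiliar with the atomic norm framework the abstract refers to, the generic gridless formulation is sketched below in standard compressed-sensing notation. This is a template, not the paper's exact program: the chirp atom set, its parameterization, and the measurement operator are assumptions for illustration.

```latex
% Generic atomic norm minimization template (not the paper's exact program).
% Atoms a(f, c) are unit-norm linear-chirp signatures with continuous frequency f and
% chirp rate c; y denotes the structured measurements and \Phi the measurement operator.
\begin{align}
  \mathcal{A} &= \bigl\{\, a(f, c) \;:\; (f, c) \in [0,1)^2 \,\bigr\}, \\
  \|x\|_{\mathcal{A}} &= \inf \Bigl\{ \sum_{k} |\alpha_k| \;:\;
      x = \sum_{k} \alpha_k\, a(f_k, c_k),\ a(f_k, c_k) \in \mathcal{A} \Bigr\}, \\
  \hat{x} &= \arg\min_{x} \; \|x\|_{\mathcal{A}}
      \quad \text{subject to} \quad y = \Phi x .
\end{align}
```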
Replacement submissions (showing 5 of 5 entries)

[7] arXiv:2307.07748 (replaced) [pdf, html, other]
Title: Audio-Visual Speech Enhancement Using Self-supervised Learning to Improve Speech Intelligibility in Cochlear Implant Simulations
Authors: Richard Lee Lai, Jen-Cheng Hou, I-Chun Chern, Kuo-Hsuan Hung, Yi-Ting Chen, Mandar Gogate, Tughrul Arslan, Amir Hussain, Yu Tsao
Subjects: Audio and Speech Processing (eess.AS)

Individuals with hearing impairments face challenges in their ability to comprehend speech, particularly in noisy environments. The aim of this study is to explore the effectiveness of audio-visual speech enhancement (AVSE) in enhancing the intelligibility of vocoded speech in cochlear implant (CI) simulations. Notably, the study focuses on a challenging scenario in which training data for the AVSE task are limited. To address this problem, we propose a novel deep neural network framework termed Self-Supervised Learning-based AVSE (SSL-AVSE). The proposed SSL-AVSE combines visual cues, such as lip and mouth movements, from the target speakers with the corresponding audio signals. The contextually combined audio and visual data are then fed into a Transformer-based SSL AV-HuBERT model to extract features, which are further processed using a BLSTM-based SE model. The results demonstrate several key findings. Firstly, SSL-AVSE successfully overcomes the issue of limited data by leveraging the AV-HuBERT model. Secondly, fine-tuning the AV-HuBERT model parameters for the target SE task yields significant performance improvements. Specifically, PESQ (Perceptual Evaluation of Speech Quality) improves from 1.43 to 1.67 and STOI (Short-Time Objective Intelligibility) from 0.70 to 0.74. Furthermore, the performance of SSL-AVSE was evaluated using CI-vocoded speech to assess intelligibility for CI users. Comparative experimental outcomes reveal that, in the presence of dynamic noises encountered during human conversations, SSL-AVSE exhibits a substantial improvement. The NCM (Normal Correlation Matrix) values indicate an increase of 26.5% to 87.2% compared to the noisy baseline.

[8] arXiv:2410.07982 (replaced) [pdf, html, other]
Title: Window Function-less DFT with Reduced Noise and Latency for Real-Time Music Analysis
Authors: Cai Biesinger, Hiromitsu Awano, Masanori Hashimoto
Comments: 5 pages, 4 figures, submitted to EUSIPCO 2025. TeX-generated PDF exemption due to formatting problems on arXiv. This version: clarified text throughout, updated data after further optimization work, added more comparisons and a table, added references
Subjects: Audio and Speech Processing (eess.AS)

Music analysis applications demand algorithms that can provide both high time and frequency resolution while minimizing noise in an already-noisy signal. Real-time analysis additionally demands low latency and low computational requirements. We propose a DFT-based algorithm that meets all of these requirements by extending a method that post-processes DFT output without the use of window functions. Our approach yields greatly reduced sidelobes and noise, and improves time resolution without sacrificing frequency resolution. We use exponentially spaced output bins that map directly to notes in music. The resulting improved performance, compared to existing FFT- and DFT-based approaches, creates possibilities for improved real-time visualizations and contributes to improved analysis quality in other applications such as automatic transcription.
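The "exponentially spaced output bins that map directly to notes" can be made concrete: bin center frequencies follow the equal-tempered scale, f(m) = 440 * 2^((m - 69)/12) Hz for MIDI note m. The sketch below illustrates only that mapping, assuming A4 = 440 Hz and an arbitrary DFT size; it is not the paper's window-function-less post-processing algorithm.

```python
# Sketch: exponentially spaced analysis bins aligned to equal-tempered note frequencies,
# in contrast to the linearly spaced bins of a plain DFT. Reference pitch, note range,
# sample rate, and DFT size are illustrative assumptions.
import numpy as np

A4 = 440.0  # Hz, assumed reference pitch

def note_frequencies(low_midi=21, high_midi=108):
    """Center frequencies for MIDI notes 21..108 (piano range A0..C8)."""
    midi = np.arange(low_midi, high_midi + 1)
    return A4 * 2.0 ** ((midi - 69) / 12.0)

def nearest_dft_bins(freqs_hz, sample_rate=48000, n_fft=8192):
    """Map each note frequency to the closest linearly spaced DFT bin index."""
    bin_width = sample_rate / n_fft
    return np.round(freqs_hz / bin_width).astype(int)

notes = note_frequencies()
print(notes[:3])                    # A0, A#0, B0 center frequencies in Hz
print(nearest_dft_bins(notes)[:3])  # corresponding plain-DFT bin indices
```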
[9] arXiv:2503.14345 (replaced) [pdf, other]
Title: MoonCast: High-Quality Zero-Shot Podcast Generation
Authors: Zeqian Ju, Dongchao Yang, Jianwei Yu, Kai Shen, Yichong Leng, Zhengtao Wang, Xu Tan, Xinyu Zhou, Tao Qin, Xiangyang Li
Subjects: Audio and Speech Processing (eess.AS); Artificial Intelligence (cs.AI); Computation and Language (cs.CL); Machine Learning (cs.LG); Sound (cs.SD)

Recent advances in text-to-speech synthesis have achieved notable success in generating high-quality short utterances for individual speakers. However, these systems still face challenges when extending their capabilities to long, multi-speaker, and spontaneous dialogues, typical of real-world scenarios such as podcasts. These limitations arise from two primary challenges: 1) long speech: podcasts typically span several minutes, exceeding the upper limit of most existing work; 2) spontaneity: podcasts are marked by their spontaneous, oral nature, which sharply contrasts with formal, written contexts, and existing works often fall short in capturing this spontaneity. In this paper, we propose MoonCast, a solution for high-quality zero-shot podcast generation, aiming to synthesize natural podcast-style speech from text-only sources (e.g., stories, technical reports, news in TXT, PDF, or Web URL formats) using the voices of unseen speakers. To generate long audio, we adopt a long-context language model-based audio modeling approach utilizing large-scale long-context speech data. To enhance spontaneity, we utilize a podcast generation module to generate scripts with spontaneous details, which have been empirically shown to be as crucial as the text-to-speech modeling itself. Experiments demonstrate that MoonCast outperforms baselines, with particularly notable improvements in spontaneity and coherence.
[10] arXiv:2409.18584 (replaced) [pdf, html, other]
Title: ChildMandarin: A Comprehensive Mandarin Speech Dataset for Young Children Aged 3-5
Authors: Jiaming Zhou, Shiyao Wang, Shiwan Zhao, Jiabei He, Haoqin Sun, Hui Wang, Cheng Liu, Aobo Kong, Yujie Guo, Xi Yang, Yequan Wang, Yonghua Lin, Yong Qin
Subjects: Sound (cs.SD); Audio and Speech Processing (eess.AS)

Automatic speech recognition (ASR) systems have advanced significantly with models like Whisper, Conformer, and self-supervised frameworks such as Wav2vec 2.0 and HuBERT. However, developing robust ASR models for young children's speech remains challenging due to differences in pronunciation, tone, and pace compared to adult speech. In this paper, we introduce a new Mandarin speech dataset focused on children aged 3 to 5, addressing the scarcity of resources in this area. The dataset comprises 41.25 hours of speech with carefully crafted manual transcriptions, collected from 397 speakers across various provinces in China, with balanced gender representation. We provide a comprehensive analysis of speaker demographics, speech duration distribution, and geographic coverage. Additionally, we evaluate ASR performance on models trained from scratch, such as Conformer, as well as fine-tuned pre-trained models like HuBERT and Whisper, where fine-tuning demonstrates significant performance improvements. Furthermore, we assess speaker verification (SV) on our dataset, showing that, despite the challenges posed by the unique vocal characteristics of young children, the dataset effectively supports both ASR and SV tasks. This dataset is a valuable contribution to Mandarin child speech research. The dataset is now open-source and freely available for all academic purposes at https://github.com/flageval-baai/ChildMandarin.
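As a starting point for the kind of evaluation the abstract describes, an off-the-shelf Whisper checkpoint can be run on individual clips with the Hugging Face transformers pipeline before any fine-tuning. This is a generic, hedged sketch: the checkpoint, clip path, and decoding options are illustrative assumptions, not the paper's evaluation setup.

```python
# Sketch: baseline Mandarin transcription of a child-speech clip with a pre-trained
# Whisper checkpoint via Hugging Face transformers. Checkpoint, file path, and decoding
# options are placeholder choices for illustration.
from transformers import pipeline

asr = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-small",   # assumed baseline checkpoint
    chunk_length_s=30,              # handle clips longer than Whisper's 30 s window
)

result = asr(
    "child_clip.wav",                                     # placeholder path
    generate_kwargs={"language": "zh", "task": "transcribe"},
)
print(result["text"])
# Character error rate against the manual transcription could then be computed with a
# library such as jiwer, e.g. jiwer.cer(reference_text, result["text"]).
```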
[11] arXiv:2503.11197 (replaced) [pdf, html, other]
Title: Reinforcement Learning Outperforms Supervised Fine-Tuning: A Case Study on Audio Question Answering
Authors: Gang Li, Jizhong Liu, Heinrich Dinkel, Yadong Niu, Junbo Zhang, Jian Luan
Subjects: Sound (cs.SD); Artificial Intelligence (cs.AI); Computation and Language (cs.CL); Audio and Speech Processing (eess.AS)

Recently, reinforcement learning (RL) has been shown to greatly enhance the reasoning capabilities of large language models (LLMs), and RL-based approaches have progressively been applied to visual multimodal tasks. However, the audio modality has largely been overlooked in these developments. Thus, we conduct a series of RL explorations in audio understanding and reasoning, specifically focusing on the audio question answering (AQA) task. We apply the group relative policy optimization (GRPO) algorithm to Qwen2-Audio-7B-Instruct, and our experiments demonstrate state-of-the-art performance on the MMAU Test-mini benchmark, achieving an accuracy rate of 64.5%. The main findings in this technical report are as follows: 1) the GRPO algorithm can be effectively applied to large audio language models (LALMs), even when the model has only 8.2B parameters; 2) with only 38k post-training samples, RL significantly outperforms supervised fine-tuning (SFT), indicating that RL-based approaches can be effective without large datasets; 3) the explicit reasoning process has not shown significant benefits for AQA tasks, and how to efficiently utilize deep thinking remains an open question for further research; 4) LALMs still lag far behind humans in auditory-language reasoning, suggesting that RL-based approaches warrant further exploration. Our project is available at https://github.com/xiaomi-research/r1-aqa and https://huggingface.co/mispeech/r1-aqa.
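The group-relative part of GRPO can be illustrated in isolation: for each question, several candidate answers are sampled, scored with a reward, and each candidate's advantage is its reward standardized within that group, with no learned value function. The sketch below shows only this advantage computation under a toy exact-match reward; the sampling, clipped policy-gradient update, and KL penalty are omitted and are not taken from the paper's implementation.

```python
# Sketch: group-relative advantages as used in GRPO-style post-training. The reward here
# is a toy exact-match check over several sampled answers per question.
import torch

def group_relative_advantages(rewards: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
    """rewards: (num_questions, group_size) -> standardized advantages, same shape."""
    mean = rewards.mean(dim=1, keepdim=True)
    std = rewards.std(dim=1, keepdim=True)
    return (rewards - mean) / (std + eps)

# Toy example: 2 AQA questions, 4 sampled answers each, reward = 1 if the sampled
# answer matches the reference choice, else 0.
sampled = [["dog bark", "car horn", "dog bark", "rain"],
           ["violin", "violin", "violin", "flute"]]
references = ["dog bark", "violin"]
rewards = torch.tensor([[1.0 if ans == ref else 0.0 for ans in group]
                        for group, ref in zip(sampled, references)])
print(group_relative_advantages(rewards))
# Positive advantages up-weight the matching answers in the subsequent policy update.
```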
