Sound
aria-label="Search term or terms" /> <input type="hidden" name="source" value="header"> <input type="hidden" name="searchtype" value="all"> <button class="button">GO</button> </div> </form> </div> <button class="toggle-control"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-white" role="menu"><title>open navigation menu</title><path d="M16 132h416c8.837 0 16-7.163 16-16V76c0-8.837-7.163-16-16-16H16C7.163 60 0 67.163 0 76v40c0 8.837 7.163 16 16 16zm0 160h416c8.837 0 16-7.163 16-16v-40c0-8.837-7.163-16-16-16H16c-8.837 0-16 7.163-16 16v40c0 8.837 7.163 16 16 16zm0 160h416c8.837 0 16-7.163 16-16v-40c0-8.837-7.163-16-16-16H16c-8.837 0-16 7.163-16 16v40c0 8.837 7.163 16 16 16z"/ ></svg></button> <div class="mobile-toggle-block toggle-target"> <nav class="mobile-menu" aria-labelledby="mobilemenulabel"> <h2 id="mobilemenulabel">quick links</h2> <ul> <li><a href="https://arxiv.org/login">Login</a></li> <li><a href="https://info.arxiv.org/help">Help Pages</a></li> <li><a href="https://info.arxiv.org/about">About</a></li> </ul> </nav> </div> </div> </div> </div><!-- /end mobile-header --> </header> <main> <div id="content"> <div id='content-inner'> <div id='dlpage'> <h1>Sound</h1> <ul> <li><a href="#item0">New submissions</a></li> <li><a href="#item6">Cross-lists</a></li> <li><a href="#item7">Replacements</a></li> </ul> <p>See <a id="recent-cs.SD" aria-labelledby="recent-cs.SD" href="/list/cs.SD/recent">recent</a> articles</p> <h3>Showing new listings for Monday, 17 February 2025</h3> <div class='paging'>Total of 11 entries </div> <div class='morefewer'>Showing up to 2000 entries per page: <a href=/list/cs.SD/new?skip=0&show=1000 rel="nofollow"> fewer</a> | <span style="color: #454545">more</span> | <span style="color: #454545">all</span> </div> <dl id='articles'> <h3>New submissions (showing 5 of 5 entries)</h3> <dt> <a name='item1'>[1]</a> <a href ="/abs/2502.09661" title="Abstract" id="2502.09661"> arXiv:2502.09661 </a> [<a href="/pdf/2502.09661" title="Download PDF" id="pdf-2502.09661" aria-labelledby="pdf-2502.09661">pdf</a>, <a href="https://arxiv.org/html/2502.09661v1" title="View HTML" id="html-2502.09661" aria-labelledby="html-2502.09661" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.09661" title="Other formats" id="oth-2502.09661" aria-labelledby="oth-2502.09661">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> SIToBI - A Speech Prosody Annotation Tool for Indian Languages </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Thinakaran,+P">Preethi Thinakaran</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Muthuramalingam,+M">Malarvizhi Muthuramalingam</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=S,+S">Sooriya S</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gladston,+A+R">Anushiya Rachel Gladston</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Vijayalakshmi,+P">P. Vijayalakshmi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Murthy,+H+A">Hema A Murthy</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Nagarajan,+T">T. 
Nagarajan</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Sound (cs.SD)</span>; Audio and Speech Processing (eess.AS) </div> <p class='mathjax'> The availability of prosodic information from speech signals is useful in a wide range of applications. However, deriving this information from speech signals can be a laborious task involving manual intervention. Therefore, the current work focuses on developing a tool that can provide prosodic annotations corresponding to a given speech signal, particularly for Indian languages. The proposed Segmentation with Intensity, Tones and Break Indices (SIToBI) tool provides time-aligned phoneme, syllable, and word transcriptions, syllable-level pitch contour annotations, break indices, and syllable-level relative intensity indices. The tool focuses more on syllable-level annotations since Indian languages are syllable-timed. Indians, regardless of the language they speak, may exhibit influences from other languages. As a result, other languages spoken in India may also exhibit syllable-timed characteristics. The accuracy of the annotations derived from the tool is analyzed by comparing them against manual annotations and the tool is observed to perform well. While the current work focuses on three languages, namely, Tamil, Hindi, and Indian English, the tool can easily be extended to other Indian languages and possibly other syllable-timed languages as well. </p> </div> </dd> <dt> <a name='item2'>[2]</a> <a href ="/abs/2502.10011" title="Abstract" id="2502.10011"> arXiv:2502.10011 </a> [<a href="/pdf/2502.10011" title="Download PDF" id="pdf-2502.10011" aria-labelledby="pdf-2502.10011">pdf</a>, <a href="https://arxiv.org/html/2502.10011v1" title="View HTML" id="html-2502.10011" aria-labelledby="html-2502.10011" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10011" title="Other formats" id="oth-2502.10011" aria-labelledby="oth-2502.10011">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> InterGridNet: An Electric Network Frequency Approach for Audio Source Location Classification Using Convolutional Neural Networks </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Korgialas,+C">Christos Korgialas</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tsingalis,+I">Ioannis Tsingalis</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tzolopoulos,+G">Georgios Tzolopoulos</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kotropoulos,+C">Constantine Kotropoulos</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> The 10th International Conference on Advances in Signal, Image and Video Processing (SIGNAL 2025) </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Sound (cs.SD)</span>; Machine Learning (cs.LG); Audio and Speech Processing (eess.AS) </div> <p class='mathjax'> A novel framework, called InterGridNet, is introduced, leveraging a shallow RawNet model for geolocation classification of Electric Network Frequency (ENF) signatures in the SP Cup 2016 dataset. During data preparation, recordings are sorted into audio and power groups based on inherent characteristics, further divided into 50 Hz and 60 Hz groups via spectrogram analysis. 
Residual blocks within the classification model extract frame-level embeddings, and decisions are made through a softmax output. The topology and hyperparameters of the shallow RawNet are optimized using neural architecture search. InterGridNet achieves 92% overall accuracy on the test recordings, demonstrating its effectiveness relative to the state-of-the-art methods evaluated in the SP Cup 2016. These findings underscore InterGridNet's ability to accurately classify audio recordings from diverse power grids, advancing state-of-the-art geolocation estimation methods.

[3] arXiv:2502.10154 [pdf, html, other]
Title: Video Soundtrack Generation by Aligning Emotions and Temporal Boundaries
Authors: Serkan Sulun, Paula Viana, Matthew E. P. Davies
Comments: Submitted to International Joint Conference on Artificial Intelligence (IJCAI) 2025
Subjects: Sound (cs.SD); Artificial Intelligence (cs.AI); Machine Learning (cs.LG); Multimedia (cs.MM); Audio and Speech Processing (eess.AS); Image and Video Processing (eess.IV)

We introduce EMSYNC, a video-based symbolic music generation model that aligns music with a video's emotional content and temporal boundaries. It follows a two-stage framework in which a pretrained video emotion classifier extracts emotional features and a conditional music generator produces MIDI sequences guided by both emotional and temporal cues. We introduce boundary offsets, a novel temporal conditioning mechanism that enables the model to anticipate and align musical chords with scene cuts. Unlike existing models, our approach retains event-based encoding, ensuring fine-grained timing control and expressive musical nuance. We also propose a mapping scheme that bridges the video emotion classifier, which produces discrete emotion categories, with the emotion-conditioned MIDI generator, which operates on continuous-valued valence-arousal inputs. In subjective listening tests, EMSYNC outperforms state-of-the-art models across all subjective metrics, for music-theory-aware participants as well as general listeners.
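The bridging step described above, collapsing a classifier's discrete emotion categories onto the continuous valence-arousal plane, can be illustrated with a small sketch. The category set, the anchor coordinates, and the probability-weighted averaging below are illustrative assumptions, not the paper's actual mapping scheme.

```python
import numpy as np

# Hypothetical valence-arousal anchors (circumplex-style); the categories and
# coordinates are illustrative, not EMSYNC's actual mapping.
VA_ANCHORS = {
    "joy":     ( 0.8,  0.6),
    "sadness": (-0.7, -0.4),
    "anger":   (-0.6,  0.7),
    "fear":    (-0.5,  0.8),
    "calm":    ( 0.5, -0.6),
}

def probs_to_valence_arousal(class_probs):
    """Collapse a discrete emotion distribution to one (valence, arousal) point
    by probability-weighted averaging of per-class anchor coordinates."""
    labels = list(VA_ANCHORS)
    p = np.array([class_probs.get(k, 0.0) for k in labels], dtype=float)
    p /= p.sum()
    anchors = np.array([VA_ANCHORS[k] for k in labels], dtype=float)
    return tuple(p @ anchors)

print(probs_to_valence_arousal({"joy": 0.7, "calm": 0.3}))  # roughly (0.71, 0.24)
```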
</p> </div> </dd> <dt> <a name='item4'>[4]</a> <a href ="/abs/2502.10329" title="Abstract" id="2502.10329"> arXiv:2502.10329 </a> [<a href="/pdf/2502.10329" title="Download PDF" id="pdf-2502.10329" aria-labelledby="pdf-2502.10329">pdf</a>, <a href="https://arxiv.org/html/2502.10329v1" title="View HTML" id="html-2502.10329" aria-labelledby="html-2502.10329" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10329" title="Other formats" id="oth-2502.10329" aria-labelledby="oth-2502.10329">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> VocalCrypt: Novel Active Defense Against Deepfake Voice Based on Masking Effect </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Fei,+Q">Qingyuan Fei</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hou,+W">Wenjie Hou</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Hai,+X">Xuan Hai</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+X">Xin Liu</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 9 pages, four figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Sound (cs.SD)</span>; Cryptography and Security (cs.CR); Multimedia (cs.MM); Audio and Speech Processing (eess.AS) </div> <p class='mathjax'> The rapid advancements in AI voice cloning, fueled by machine learning, have significantly impacted text-to-speech (TTS) and voice conversion (VC) fields. While these developments have led to notable progress, they have also raised concerns about the misuse of AI VC technology, causing economic losses and negative public perceptions. To address this challenge, this study focuses on creating active defense mechanisms against AI VC systems. <br>We propose a novel active defense method, VocalCrypt, which embeds pseudo-timbre (jamming information) based on SFS into audio segments that are imperceptible to the human ear, thereby forming systematic fragments to prevent voice cloning. This approach protects the voice without compromising its quality. In comparison to existing methods, such as adversarial noise incorporation, VocalCrypt significantly enhances robustness and real-time performance, achieving a 500\% increase in generation speed while maintaining interference effectiveness. <br>Unlike audio watermarking techniques, which focus on post-detection, our method offers preemptive defense, reducing implementation costs and enhancing feasibility. Extensive experiments using the Zhvoice and VCTK Corpus datasets show that our AI-cloned speech defense system performs excellently in automatic speaker verification (ASV) tests while preserving the integrity of the protected audio. 
</p> </div> </dd> <dt> <a name='item5'>[5]</a> <a href ="/abs/2502.10362" title="Abstract" id="2502.10362"> arXiv:2502.10362 </a> [<a href="/pdf/2502.10362" title="Download PDF" id="pdf-2502.10362" aria-labelledby="pdf-2502.10362">pdf</a>, <a href="https://arxiv.org/html/2502.10362v1" title="View HTML" id="html-2502.10362" aria-labelledby="html-2502.10362" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.10362" title="Other formats" id="oth-2502.10362" aria-labelledby="oth-2502.10362">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> CLaMP 3: Universal Music Information Retrieval Across Unaligned Modalities and Unseen Languages </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Wu,+S">Shangda Wu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Guo,+Z">Zhancheng Guo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yuan,+R">Ruibin Yuan</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Jiang,+J">Junyan Jiang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Doh,+S">Seungheon Doh</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Xia,+G">Gus Xia</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Nam,+J">Juhan Nam</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+X">Xiaobing Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yu,+F">Feng Yu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+M">Maosong Sun</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 20 pages, 8 figures, 12 tables </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Sound (cs.SD)</span>; Audio and Speech Processing (eess.AS) </div> <p class='mathjax'> CLaMP 3 is a unified framework developed to address challenges of cross-modal and cross-lingual generalization in music information retrieval. Using contrastive learning, it aligns all major music modalities--including sheet music, performance signals, and audio recordings--with multilingual text in a shared representation space, enabling retrieval across unaligned modalities with text as a bridge. It features a multilingual text encoder adaptable to unseen languages, exhibiting strong cross-lingual generalization. Leveraging retrieval-augmented generation, we curated M4-RAG, a web-scale dataset consisting of 2.31 million music-text pairs. This dataset is enriched with detailed metadata that represents a wide array of global musical traditions. To advance future research, we release WikiMT-X, a benchmark comprising 1,000 triplets of sheet music, audio, and richly varied text descriptions. Experiments show that CLaMP 3 achieves state-of-the-art performance on multiple MIR tasks, significantly surpassing previous strong baselines and demonstrating excellent generalization in multimodal and multilingual music contexts. 
</p> </div> </dd> </dl> <dl id='articles'> <h3>Cross submissions (showing 1 of 1 entries)</h3> <dt> <a name='item6'>[6]</a> <a href ="/abs/2502.09940" title="Abstract" id="2502.09940"> arXiv:2502.09940 </a> (cross-list from cs.CL) [<a href="/pdf/2502.09940" title="Download PDF" id="pdf-2502.09940" aria-labelledby="pdf-2502.09940">pdf</a>, <a href="https://arxiv.org/html/2502.09940v1" title="View HTML" id="html-2502.09940" aria-labelledby="html-2502.09940" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.09940" title="Other formats" id="oth-2502.09940" aria-labelledby="oth-2502.09940">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> A Preliminary Exploration with GPT-4o Voice Mode </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Lin,+Y">Yu-Xiang Lin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yang,+C">Chih-Kai Yang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+W">Wei-Chih Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+C">Chen-An Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Huang,+C">Chien-yu Huang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chen,+X">Xuanjun Chen</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Lee,+H">Hung-yi Lee</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Work in progress </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Computation and Language (cs.CL)</span>; Sound (cs.SD); Audio and Speech Processing (eess.AS) </div> <p class='mathjax'> With the rise of multimodal large language models, GPT-4o stands out as a pioneering model, driving us to evaluate its capabilities. This report assesses GPT-4o across various tasks to analyze its audio processing and reasoning abilities. We find that GPT-4o exhibits strong knowledge in audio, speech, and music understanding, performing well in tasks like intent classification, spoken command classification, semantic and grammatical reasoning., multilingual speech recognition, and singing analysis. It also shows greater robustness against hallucinations than other large audio-language models (LALMs). However, it struggles with tasks such as audio duration prediction and instrument classification. Additionally, GPT-4o's safety mechanisms cause it to decline tasks like speaker identification, age classification, MOS prediction, and audio deepfake detection. Notably, the model exhibits a significantly different refusal rate when responding to speaker verification tasks on different datasets. This is likely due to variations in the accompanying instructions or the quality of the input audio, suggesting the sensitivity of its built-in safeguards. Finally, we acknowledge that model performance varies with evaluation protocols. This report only serves as a preliminary exploration of the current state of LALMs. 
</p> </div> </dd> </dl> <dl id='articles'> <h3>Replacement submissions (showing 5 of 5 entries)</h3> <dt> <a name='item7'>[7]</a> <a href ="/abs/2409.11731" title="Abstract" id="2409.11731"> arXiv:2409.11731 </a> (replaced) [<a href="/pdf/2409.11731" title="Download PDF" id="pdf-2409.11731" aria-labelledby="pdf-2409.11731">pdf</a>, <a href="https://arxiv.org/html/2409.11731v3" title="View HTML" id="html-2409.11731" aria-labelledby="html-2409.11731" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2409.11731" title="Other formats" id="oth-2409.11731" aria-labelledby="oth-2409.11731">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Performance and Robustness of Signal-Dependent vs. Signal-Independent Binaural Signal Matching with Wearable Microphone Arrays </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&query=Berger,+A">Ami Berger</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Tourbabin,+V">Vladimir Tourbabin</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Donley,+J">Jacob Donley</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Ben-Hur,+Z">Zamir Ben-Hur</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Rafaely,+B">Boaz Rafaely</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Audio and Speech Processing (eess.AS)</span>; Sound (cs.SD) </div> <p class='mathjax'> The increasing popularity of spatial audio in applications such as teleconferencing, entertainment, and virtual reality has led to the recent developments of binaural reproduction methods. However, only a few of these methods are well-suited for wearable and mobile arrays, which typically consist of a small number of microphones. One such method is binaural signal matching (BSM), which has been shown to produce high-quality binaural signals for wearable arrays. However, BSM may be suboptimal in cases of high direct-to-reverberant ratio (DRR) as it is based on the diffuse sound field assumption. To overcome this limitation, previous studies incorporated sound-field models other than diffuse. However, performance may be sensitive to signal estimation errors. This paper aims to provide a systematic and comprehensive analysis of signal-dependent vs. signal-independent BSM, so that the benefits and limitations of the methods become clearer. Two signal-dependent BSM-based methods designed for high DRR scenarios that incorporate a sound field model composed of direct and reverberant components are investigated mathematically, using simulations, and finally validated by a listening test, and compared to the signal-independent BSM. The results show that signal-dependent BSM can significantly improve performance, in particular in the direction of the source, while presenting only a negligible degradation in other directions. Furthermore, when source direction estimation is inaccurate, performance of of the signal-dependent BSM degrade to equal that of the signal-independent BSM, presenting a desired robustness quality. 
</p> </div> </dd> <dt> <a name='item8'>[8]</a> <a href ="/abs/2410.05101" title="Abstract" id="2410.05101"> arXiv:2410.05101 </a> (replaced) [<a href="/pdf/2410.05101" title="Download PDF" id="pdf-2410.05101" aria-labelledby="pdf-2410.05101">pdf</a>, <a href="https://arxiv.org/html/2410.05101v4" title="View HTML" id="html-2410.05101" aria-labelledby="html-2410.05101" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2410.05101" title="Other formats" id="oth-2410.05101" aria-labelledby="oth-2410.05101">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> CR-CTC: Consistency regularization on CTC for improved speech recognition </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&query=Yao,+Z">Zengwei Yao</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Kang,+W">Wei Kang</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Yang,+X">Xiaoyu Yang</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Kuang,+F">Fangjun Kuang</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Guo,+L">Liyong Guo</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Zhu,+H">Han Zhu</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Jin,+Z">Zengrui Jin</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Li,+Z">Zhaoqing Li</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Lin,+L">Long Lin</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Povey,+D">Daniel Povey</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> Published as a conference paper at ICLR 2025 </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Audio and Speech Processing (eess.AS)</span>; Machine Learning (cs.LG); Sound (cs.SD) </div> <p class='mathjax'> Connectionist Temporal Classification (CTC) is a widely used method for automatic speech recognition (ASR), renowned for its simplicity and computational efficiency. However, it often falls short in recognition performance. In this work, we propose the Consistency-Regularized CTC (CR-CTC), which enforces consistency between two CTC distributions obtained from different augmented views of the input speech mel-spectrogram. We provide in-depth insights into its essential behaviors from three perspectives: 1) it conducts self-distillation between random pairs of sub-models that process different augmented views; 2) it learns contextual representation through masked prediction for positions within time-masked regions, especially when we increase the amount of time masking; 3) it suppresses the extremely peaky CTC distributions, thereby reducing overfitting and improving the generalization ability. Extensive experiments on LibriSpeech, Aishell-1, and GigaSpeech datasets demonstrate the effectiveness of our CR-CTC. It significantly improves the CTC performance, achieving state-of-the-art results comparable to those attained by transducer or systems combining CTC and attention-based encoder-decoder (CTC/AED). We release our code at <a href="https://github.com/k2-fsa/icefall" rel="external noopener nofollow" class="link-external link-https">this https URL</a>. 
</p> </div> </dd> <dt> <a name='item9'>[9]</a> <a href ="/abs/2501.04116" title="Abstract" id="2501.04116"> arXiv:2501.04116 </a> (replaced) [<a href="/pdf/2501.04116" title="Download PDF" id="pdf-2501.04116" aria-labelledby="pdf-2501.04116">pdf</a>, <a href="https://arxiv.org/html/2501.04116v2" title="View HTML" id="html-2501.04116" aria-labelledby="html-2501.04116" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2501.04116" title="Other formats" id="oth-2501.04116" aria-labelledby="oth-2501.04116">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Artifact-free Sound Quality in DNN-based Closed-loop Systems for Audio Processing </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&query=Wen,+C">Chuan Wen</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Torfs,+G">Guy Torfs</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Verhulst,+S">Sarah Verhulst</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Audio and Speech Processing (eess.AS)</span>; Sound (cs.SD) </div> <p class='mathjax'> Recent advances in deep neural networks (DNNs) have significantly improved various audio processing applications, including speech enhancement, synthesis, and hearing aid algorithms. DNN-based closed-loop systems have gained popularity in these applications due to their robust performance and ability to adapt to diverse conditions. Despite their effectiveness, current DNN-based closed-loop systems often suffer from sound quality degradation caused by artifacts introduced by suboptimal sampling methods. To address this challenge, we introduce dCoNNear, a novel DNN architecture designed for seamless integration into closed-loop frameworks. This architecture specifically aims to prevent the generation of spurious artifacts. We demonstrate the effectiveness of dCoNNear through a proof-of-principle example within a closed-loop framework that employs biophysically realistic models of auditory processing for both normal and hearing-impaired profiles to design personalized hearing aid algorithms. Our results show that dCoNNear not only accurately simulates all processing stages of existing non-DNN biophysical models but also eliminates audible artifacts, thereby enhancing the sound quality of the resulting hearing aid algorithms. This study presents a novel, artifact-free closed-loop framework that improves the sound quality of audio processing systems, offering a promising solution for high-fidelity applications in audio and hearing technologies. 
</p> </div> </dd> <dt> <a name='item10'>[10]</a> <a href ="/abs/2502.03930" title="Abstract" id="2502.03930"> arXiv:2502.03930 </a> (replaced) [<a href="/pdf/2502.03930" title="Download PDF" id="pdf-2502.03930" aria-labelledby="pdf-2502.03930">pdf</a>, <a href="https://arxiv.org/html/2502.03930v2" title="View HTML" id="html-2502.03930" aria-labelledby="html-2502.03930" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2502.03930" title="Other formats" id="oth-2502.03930" aria-labelledby="oth-2502.03930">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> DiTAR: Diffusion Transformer Autoregressive Modeling for Speech Generation </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&query=Jia,+D">Dongya Jia</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Chen,+Z">Zhuo Chen</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Chen,+J">Jiawei Chen</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Du,+C">Chenpeng Du</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Wu,+J">Jian Wu</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Cong,+J">Jian Cong</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Zhuang,+X">Xiaobin Zhuang</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Li,+C">Chumin Li</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Wei,+Z">Zhen Wei</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Wang,+Y">Yuping Wang</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Wang,+Y">Yuxuan Wang</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> 16 pages, 8 figures </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Audio and Speech Processing (eess.AS)</span>; Artificial Intelligence (cs.AI); Computation and Language (cs.CL); Machine Learning (cs.LG); Sound (cs.SD) </div> <p class='mathjax'> Several recent studies have attempted to autoregressively generate continuous speech representations without discrete speech tokens by combining diffusion and autoregressive models, yet they often face challenges with excessive computational loads or suboptimal outcomes. In this work, we propose Diffusion Transformer Autoregressive Modeling (DiTAR), a patch-based autoregressive framework combining a language model with a diffusion transformer. This approach significantly enhances the efficacy of autoregressive models for continuous tokens and reduces computational demands. DiTAR utilizes a divide-and-conquer strategy for patch generation, where the language model processes aggregated patch embeddings and the diffusion transformer subsequently generates the next patch based on the output of the language model. For inference, we propose defining temperature as the time point of introducing noise during the reverse diffusion ODE to balance diversity and determinism. We also show in the extensive scaling analysis that DiTAR has superb scalability. In zero-shot speech generation, DiTAR achieves state-of-the-art performance in robustness, speaker similarity, and naturalness. 
</p> </div> </dd> <dt> <a name='item11'>[11]</a> <a href ="/abs/2502.05674" title="Abstract" id="2502.05674"> arXiv:2502.05674 </a> (replaced) [<a href="/pdf/2502.05674" title="Download PDF" id="pdf-2502.05674" aria-labelledby="pdf-2502.05674">pdf</a>, <a href="/format/2502.05674" title="Other formats" id="oth-2502.05674" aria-labelledby="oth-2502.05674">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Less is More for Synthetic Speech Detection in the Wild </div> <div class='list-authors'><a href="https://arxiv.org/search/eess?searchtype=author&query=Garg,+A">Ashi Garg</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Cai,+Z">Zexin Cai</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Xinyuan,+H+L">Henry Li Xinyuan</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Garc%C3%ADa-Perera,+L+P">Leibny Paola Garc铆a-Perera</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Duh,+K">Kevin Duh</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Khudanpur,+S">Sanjeev Khudanpur</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Wiesner,+M">Matthew Wiesner</a>, <a href="https://arxiv.org/search/eess?searchtype=author&query=Andrews,+N">Nicholas Andrews</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Audio and Speech Processing (eess.AS)</span>; Sound (cs.SD) </div> <p class='mathjax'> Driven by advances in self-supervised learning for speech, state-of-the-art synthetic speech detectors have achieved low error rates on popular benchmarks such as ASVspoof. However, prior benchmarks do not address the wide range of real-world variability in speech. Are reported error rates realistic in real-world conditions? To assess detector failure modes and robustness under controlled distribution shifts, we introduce ShiftySpeech, a benchmark with more than 3000 hours of synthetic speech from 7 domains, 6 TTS systems, 12 vocoders, and 3 languages. We found that all distribution shifts degraded model performance, and contrary to prior findings, training on more vocoders, speakers, or with data augmentation did not guarantee better generalization. In fact, we found that training on less diverse data resulted in better generalization, and that a detector fit using samples from a single carefully selected vocoder and a small number of speakers, without data augmentations, achieved state-of-the-art results on the challenging In-the-Wild benchmark. 
</p> </div> </dd> </dl> <div class='paging'>Total of 11 entries </div> <div class='morefewer'>Showing up to 2000 entries per page: <a href=/list/cs.SD/new?skip=0&show=1000 rel="nofollow"> fewer</a> | <span style="color: #454545">more</span> | <span style="color: #454545">all</span> </div> </div> </div> </div> </main> <footer style="clear: both;"> <div class="columns is-desktop" role="navigation" aria-label="Secondary" style="margin: -0.75em -0.75em 0.75em -0.75em"> <!-- Macro-Column 1 --> <div class="column" style="padding: 0;"> <div class="columns"> <div class="column"> <ul style="list-style: none; line-height: 2;"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul style="list-style: none; line-height: 2;"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- End Macro-Column 1 --> <!-- Macro-Column 2 --> <div class="column" style="padding: 0;"> <div class="columns"> <div class="column"> <ul style="list-style: none; line-height: 2;"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul style="list-style: none; line-height: 2;"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 
320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> <!-- End Macro-Column 2 --> </div> </footer> </div> <script src="/static/base/1.0.1/js/member_acknowledgement.js"></script> </body> </html>