Sound
aria-label="Search term or terms" /> <input type="hidden" name="source" value="header"> <input type="hidden" name="searchtype" value="all"> <button class="button">GO</button> </div> </form> </div> <button class="toggle-control"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-white" role="menu"><title>open navigation menu</title><path d="M16 132h416c8.837 0 16-7.163 16-16V76c0-8.837-7.163-16-16-16H16C7.163 60 0 67.163 0 76v40c0 8.837 7.163 16 16 16zm0 160h416c8.837 0 16-7.163 16-16v-40c0-8.837-7.163-16-16-16H16c-8.837 0-16 7.163-16 16v40c0 8.837 7.163 16 16 16zm0 160h416c8.837 0 16-7.163 16-16v-40c0-8.837-7.163-16-16-16H16c-8.837 0-16 7.163-16 16v40c0 8.837 7.163 16 16 16z"/ ></svg></button> <div class="mobile-toggle-block toggle-target"> <nav class="mobile-menu" aria-labelledby="mobilemenulabel"> <h2 id="mobilemenulabel">quick links</h2> <ul> <li><a href="https://arxiv.org/login">Login</a></li> <li><a href="https://info.arxiv.org/help">Help Pages</a></li> <li><a href="https://info.arxiv.org/about">About</a></li> </ul> </nav> </div> </div> </div> </div><!-- /end mobile-header --> </header> <main> <div id="content"> <div id='content-inner'> <div id='dlpage'> <h1>Sound</h1> <ul> <li><a href="#item0">New submissions</a></li> <li><a href="#item4">Cross-lists</a></li> <li><a href="#item6">Replacements</a></li> </ul> <p>See <a id="recent-cs.SD" aria-labelledby="recent-cs.SD" href="/list/cs.SD/recent">recent</a> articles</p> <h3>Showing new listings for Monday, 7 April 2025</h3> <div class='paging'>Total of 9 entries </div> <div class='morefewer'>Showing up to 2000 entries per page: <a href=/list/cs.SD/new?skip=0&show=1000 rel="nofollow"> fewer</a> | <span style="color: #454545">more</span> | <span style="color: #454545">all</span> </div> <dl id='articles'> <h3>New submissions (showing 3 of 3 entries)</h3> <dt> <a name='item1'>[1]</a> <a href ="/abs/2504.02988" title="Abstract" id="2504.02988"> arXiv:2504.02988 </a> [<a href="/pdf/2504.02988" title="Download PDF" id="pdf-2504.02988" aria-labelledby="pdf-2504.02988">pdf</a>, <a href="https://arxiv.org/html/2504.02988v1" title="View HTML" id="html-2504.02988" aria-labelledby="html-2504.02988" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2504.02988" title="Other formats" id="oth-2504.02988" aria-labelledby="oth-2504.02988">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Generating Diverse Audio-Visual 360 Soundscapes for Sound Event Localization and Detection </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Roman,+A+S">Adrian S. Roman</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Chang,+A">Aiden Chang</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Meza,+G">Gerardo Meza</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Roman,+I+R">Iran R. Roman</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Sound (cs.SD)</span>; Audio and Speech Processing (eess.AS) </div> <p class='mathjax'> We present SELDVisualSynth, a tool for generating synthetic videos for audio-visual sound event localization and detection (SELD). Our approach incorporates real-world background images to improve realism in synthetic audio-visual SELD data while also ensuring audio-visual spatial alignment. The tool creates 360 synthetic videos where objects move matching synthetic SELD audio data and its annotations. 
Experimental results demonstrate that a model trained with this data attains performance gains across multiple metrics, achieving superior localization recall (56.4 LR) and competitive localization error (21.9° LE). We open-source our data generation tool for maximal use by members of the SELD research community.
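To make the audio-visual alignment idea concrete, here is a minimal sketch (not taken from the paper's tool; frame size, angle conventions, and function names are illustrative assumptions) of mapping a SELD-style direction-of-arrival annotation to pixel coordinates in an equirectangular 360° frame, so a visual object can be placed where the sound source is annotated:

    # Illustrative only: map a SELD-style direction-of-arrival annotation
    # (azimuth, elevation, in degrees) to pixel coordinates in an
    # equirectangular 360-degree video frame.
    # Assumed conventions (not from the paper): azimuth in [-180, 180),
    # 0 = front, positive = left; elevation in [-90, 90], positive = up.

    def doa_to_equirect_pixel(azimuth_deg: float, elevation_deg: float,
                              width: int = 1920, height: int = 960) -> tuple[int, int]:
        """Return (x, y) pixel coordinates for a DOA on an equirectangular frame."""
        # Horizontal axis spans 360 deg of azimuth; vertical spans 180 deg of elevation.
        x = ((-azimuth_deg + 180.0) / 360.0) * width      # left-positive azimuth -> right-increasing x
        y = ((90.0 - elevation_deg) / 180.0) * height     # top of frame = +90 deg elevation
        return int(round(x)) % width, min(max(int(round(y)), 0), height - 1)

    # Example: a source directly in front at the horizon lands at the frame centre.
    print(doa_to_equirect_pixel(0.0, 0.0))   # -> (960, 480)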
[2] arXiv:2504.03289 [pdf, html, other]
Title: RWKVTTS: Yet another TTS based on RWKV-7
Authors: Lin yueyu, Liu Xiao
Subjects: Sound (cs.SD); Computation and Language (cs.CL); Audio and Speech Processing (eess.AS)

Human-AI interaction thrives on intuitive and efficient interfaces, among which voice stands out as a particularly natural and accessible modality. Recent advancements in transformer-based text-to-speech (TTS) systems, such as Fish-Speech, CosyVoice, and MegaTTS 3, have delivered remarkable improvements in quality and realism, driving a significant evolution in the TTS domain. In this paper, we introduce RWKV-7 [peng2025rwkv], a cutting-edge RNN-based architecture tailored for TTS applications. Unlike traditional transformer models, RWKV-7 leverages the strengths of recurrent neural networks to achieve greater computational efficiency and scalability, while maintaining high-quality output. Our comprehensive benchmarks demonstrate that RWKV-7 outperforms transformer-based models across multiple key metrics, including synthesis speed, naturalness of speech, and resource efficiency. Furthermore, we explore its adaptability to diverse linguistic contexts and low-resource environments, showcasing its potential to democratize TTS technology. These findings position RWKV-7 as a powerful and innovative alternative, paving the way for more accessible and versatile voice synthesis solutions in real-world applications. Our code and weights are available at https://github.com/yynil/RWKVTTS and https://huggingface.co/spaces/RWKV-Red-Team.

[3] arXiv:2504.03373 [pdf, html, other]
Title: An Efficient GPU-based Implementation for Noise Robust Sound Source Localization
Authors: Zirui Lin, Masayuki Takigahira, Naoya Terakado, Haris Gulzar, Monikka Roslianna Busto, Takeharu Eda, Katsutoshi Itoyama, Kazuhiro Nakadai, Hideharu Amano
Comments: 6 pages, 2 figures
Subjects: Sound (cs.SD); Robotics (cs.RO); Audio and Speech Processing (eess.AS)

Robot audition, encompassing Sound Source Localization (SSL), Sound Source Separation (SSS), and Automatic Speech Recognition (ASR), enables robots and smart devices to acquire auditory capabilities similar to human hearing. Despite their wide applicability, processing multi-channel audio signals from microphone arrays in SSL involves computationally intensive matrix operations, which can hinder efficient deployment on Central Processing Units (CPUs), particularly in embedded systems with limited CPU resources. This paper introduces a GPU-based implementation of SSL for robot audition, utilizing the Generalized Singular Value Decomposition-based Multiple Signal Classification (GSVD-MUSIC), a noise-robust algorithm, within the HARK platform, an open-source software suite. For a 60-channel microphone array, the proposed implementation achieves significant performance improvements.
On the Jetson AGX Orin, an embedded device powered by an NVIDIA GPU and ARM Cortex-A78AE v8.2 64-bit CPUs, we observe speedups of 4645.1x for the GSVD calculation and 8.8x for the entire SSL module; on a server configured with an NVIDIA A100 GPU and AMD EPYC 7352 CPUs, the corresponding speedups are 2223.4x and 8.95x. These gains make real-time processing feasible for large-scale microphone arrays and leave ample capacity for real-time processing of potential subsequent machine learning or deep learning tasks.
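For orientation, the following is a simplified, CPU-only sketch of the kind of noise-whitened MUSIC pseudo-spectrum computation that GSVD-MUSIC is designed to perform per frequency bin. It does not reproduce the paper's GPU implementation or HARK's internals; the uniform linear array geometry, shapes, and variable names are assumptions for illustration:

    # Simplified, noise-whitened MUSIC pseudo-spectrum for one frequency bin
    # (the flavour of linear algebra that GSVD-MUSIC accelerates on the GPU).
    import numpy as np
    from scipy.linalg import sqrtm

    def music_spectrum(R_sig, R_noise, mic_pos, freq_hz, n_sources, c=343.0, n_angles=360):
        """R_sig, R_noise: (M, M) spatial correlation matrices at one frequency bin.
        mic_pos: (M,) microphone x-coordinates in metres (linear array assumed).
        Returns angles (deg) and the MUSIC pseudo-spectrum over those angles."""
        W = np.linalg.inv(sqrtm(R_noise))                 # noise-whitening transform R_noise^(-1/2)
        evals, evecs = np.linalg.eigh(W @ R_sig @ W.conj().T)
        E_noise = evecs[:, : R_sig.shape[0] - n_sources]  # eigenvectors spanning the noise subspace

        angles = np.linspace(-90.0, 90.0, n_angles)
        spectrum = np.empty(n_angles)
        for i, theta in enumerate(np.deg2rad(angles)):
            delays = mic_pos * np.sin(theta) / c          # per-microphone propagation delays
            a = np.exp(-2j * np.pi * freq_hz * delays)    # steering vector for this direction
            a_w = W @ a                                   # whiten the steering vector too
            proj = E_noise.conj().T @ a_w
            spectrum[i] = 1.0 / np.real(proj.conj().T @ proj)  # peaks at source directions
        return angles, spectrum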
Cross submissions (showing 2 of 2 entries)

[4] arXiv:2504.03329 (cross-list from eess.AS) [pdf, html, other]
Title: Mind the Prompt: Prompting Strategies in Audio Generations for Improving Sound Classification
Authors: Francesca Ronchini, Ho-Hsiang Wu, Wei-Cheng Lin, Fabio Antonacci
Comments: Accepted at Generative Data Augmentation for Real-World Signal Processing Applications Workshop
Subjects: Audio and Speech Processing (eess.AS); Artificial Intelligence (cs.AI); Sound (cs.SD); Signal Processing (eess.SP)

This paper investigates the design of effective prompt strategies for generating realistic datasets using Text-To-Audio (TTA) models. We also analyze different techniques for efficiently combining these datasets to enhance their utility in sound classification tasks. We apply a range of prompt strategies across two sound classification datasets and two TTA models. Our findings reveal that task-specific prompt strategies significantly outperform basic prompt approaches in data generation. Furthermore, merging datasets generated using different TTA models enhances classification performance more effectively than merely increasing the training dataset size. Overall, our results underscore the advantages of these methods as effective data augmentation techniques using synthetic data.
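As a rough illustration of the basic vs. task-specific prompting contrast studied here: the templates, class names, and the stubbed generate_audio call below are hypothetical placeholders, not the paper's prompts or any specific TTA model's API:

    # Contrast a "basic" prompt with richer, task-specific prompt templates for
    # generating synthetic training clips with a text-to-audio (TTA) model.
    import random

    CLASSES = ["dog_bark", "siren", "jackhammer"]            # example target classes

    def basic_prompt(label: str) -> str:
        return f"the sound of a {label.replace('_', ' ')}"

    def task_specific_prompt(label: str) -> str:
        # Adds acoustic context and recording-condition details relevant to the
        # downstream classification task (urban sound tagging, assumed here).
        contexts = ["on a busy city street", "in a quiet residential area", "inside a courtyard"]
        qualities = ["recorded with a smartphone", "with distant traffic in the background"]
        return (f"a {label.replace('_', ' ')} {random.choice(contexts)}, "
                f"{random.choice(qualities)}")

    def build_synthetic_dataset(n_per_class: int, prompt_fn) -> list[tuple[str, str]]:
        """Return (prompt, label) pairs to feed a TTA model; audio generation is stubbed."""
        dataset = []
        for label in CLASSES:
            for _ in range(n_per_class):
                prompt = prompt_fn(label)
                # audio = generate_audio(prompt)   # hypothetical TTA call
                dataset.append((prompt, label))
        return dataset

    print(build_synthetic_dataset(2, task_specific_prompt)[:2])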
[5] arXiv:2504.03546 (cross-list from cs.CL) [pdf, html, other]
Title: MultiMed-ST: Large-scale Many-to-many Multilingual Medical Speech Translation
Authors: Khai Le-Duc, Tuyen Tran, Bach Phan Tat, Nguyen Kim Hai Bui, Quan Dang, Hung-Phong Tran, Thanh-Thuy Nguyen, Ly Nguyen, Tuan-Minh Phan, Thi Thu Phuong Tran, Chris Ngo, Nguyen X. Khanh, Thanh Nguyen-Tang
Comments: Preprint, 122 pages
Subjects: Computation and Language (cs.CL); Artificial Intelligence (cs.AI); Machine Learning (cs.LG); Sound (cs.SD); Audio and Speech Processing (eess.AS)

Multilingual speech translation (ST) in the medical domain enhances patient care by enabling efficient communication across language barriers, alleviating specialized workforce shortages, and facilitating improved diagnosis and treatment, particularly during pandemics. In this work, we present, to the best of our knowledge, the first systematic study on medical ST, releasing MultiMed-ST, a large-scale ST dataset for the medical domain spanning all translation directions in five languages: Vietnamese, English, German, French, and Chinese (Traditional and Simplified), together with the models. With 290,000 samples, our dataset is the largest medical machine translation (MT) dataset and the largest many-to-many multilingual ST dataset across all domains. Secondly, we present the most extensive analysis study in ST research to date, including: empirical baselines, a bilingual vs. multilingual comparative study, an end-to-end vs. cascaded comparative study, a task-specific vs. multi-task sequence-to-sequence (seq2seq) comparative study, code-switch analysis, and quantitative-qualitative error analysis. All code, data, and models are available online: https://github.com/leduckhai/MultiMed-ST.

Replacement submissions (showing 4 of 4 entries)

[6] arXiv:2502.16936 (replaced) [pdf, html, other]
Title: Supervised contrastive learning from weakly-labeled audio segments for musical version matching
Authors: Joan Serrà, R. Oguz Araz, Dmitry Bogdanov, Yuki Mitsufuji
Comments: 16 pages, 6 figures, 8 tables; includes Appendix
Subjects: Sound (cs.SD); Artificial Intelligence (cs.AI); Machine Learning (cs.LG); Audio and Speech Processing (eess.AS); Machine Learning (stat.ML)

Detecting musical versions (different renditions of the same piece) is a challenging task with important applications. Because of the nature of the available ground truth, existing approaches match musical versions at the track level (e.g., whole song). However, most applications require matching at the segment level (e.g., 20 s chunks). In addition, existing approaches resort to classification and triplet losses, disregarding more recent losses that could bring meaningful improvements. In this paper, we propose a method to learn from weakly annotated segments, together with a contrastive loss variant that outperforms well-studied alternatives. The former is based on pairwise segment distance reductions, while the latter modifies an existing loss following decoupling, hyper-parameter, and geometric considerations. With these two elements, we not only achieve state-of-the-art results in the standard track-level evaluation but also obtain breakthrough performance in a segment-level evaluation. We believe that, due to the generality of the challenges addressed here, the proposed methods may find utility in domains beyond audio or musical version matching.
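For context, a standard supervised contrastive (SupCon-style) loss over L2-normalised segment embeddings, where segments coming from versions of the same work count as positives, looks as follows. This is illustrative background only, not the paper's proposed loss variant (which further modifies such a loss along decoupling, hyper-parameter, and geometric lines):

    import torch
    import torch.nn.functional as F

    def supcon_loss(embeddings: torch.Tensor, work_ids: torch.Tensor, temperature: float = 0.1):
        """embeddings: (N, D) segment embeddings; work_ids: (N,) integer id of the underlying work."""
        z = F.normalize(embeddings, dim=1)
        sim = z @ z.t() / temperature                              # pairwise similarities
        self_mask = torch.eye(len(z), dtype=torch.bool, device=z.device)
        sim = sim.masked_fill(self_mask, float("-inf"))            # exclude self-pairs
        log_prob = sim - torch.logsumexp(sim, dim=1, keepdim=True)

        pos_mask = (work_ids.unsqueeze(0) == work_ids.unsqueeze(1)) & ~self_mask
        pos_counts = pos_mask.sum(dim=1)
        valid = pos_counts > 0                                     # anchors with at least one positive
        mean_pos = log_prob.masked_fill(~pos_mask, 0.0).sum(dim=1)[valid] / pos_counts[valid]
        return -mean_pos.mean()

    # Example: 8 random segment embeddings drawn from 3 musical works.
    emb = torch.randn(8, 128)
    ids = torch.tensor([0, 0, 1, 1, 1, 2, 2, 0])
    print(supcon_loss(emb, ids))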
[7] arXiv:2406.15888 (replaced) [pdf, html, other]
Title: Real-time Speech Summarization for Medical Conversations
Authors: Khai Le-Duc, Khai-Nguyen Nguyen, Long Vo-Dang, Truong-Son Hy
Comments: Interspeech 2024 (Oral)
Subjects: Computation and Language (cs.CL); Artificial Intelligence (cs.AI); Machine Learning (cs.LG); Sound (cs.SD); Audio and Speech Processing (eess.AS)

In doctor-patient conversations, identifying medically relevant information is crucial, motivating the need for conversation summarization. In this work, we propose the first deployable real-time speech summarization system for real-world applications in industry, which generates a local summary after every N speech utterances within a conversation and a global summary at the end of a conversation. Our system could enhance user experience from a business standpoint, while also reducing computational costs from a technical perspective. Secondly, we present VietMed-Sum which, to our knowledge, is the first speech summarization dataset for medical conversations. Thirdly, we are the first to utilize LLMs and human annotators collaboratively to create gold-standard and synthetic summaries for medical conversation summarization. Finally, we present baseline results of state-of-the-art models on VietMed-Sum. All code, data (English-translated and Vietnamese) and models are available online: https://github.com/leduckhai/MultiMed/tree/master/VietMed-Sum.
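A rough sketch of the local/global summarization schedule described above; N and the summarize stub are illustrative placeholders, not the deployed system:

    # Emit a local summary after every N recognised utterances and a global
    # summary once the conversation ends.
    N = 5  # utterances per local summary (illustrative value)

    def summarize(utterances):
        # Placeholder: a real system would call an abstractive summarization model here.
        return " / ".join(utterances)[:80]

    def run_conversation(utterance_stream):
        buffer, transcript, local_summaries = [], [], []
        for utt in utterance_stream:            # utterances arrive from streaming ASR
            buffer.append(utt)
            transcript.append(utt)
            if len(buffer) == N:                # local summary every N utterances
                local_summaries.append(summarize(buffer))
                buffer = []
        if buffer:                              # summarize any trailing utterances
            local_summaries.append(summarize(buffer))
        global_summary = summarize(transcript)  # one global summary at the end
        return local_summaries, global_summary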
[8] arXiv:2410.15316 (replaced) [pdf, html, other]
Title: Ichigo: Mixed-Modal Early-Fusion Realtime Voice Assistant
Authors: Alan Dao (Gia Tuan Dao), Dinh Bach Vu, Huy Hoang Ha
Subjects: Computation and Language (cs.CL); Sound (cs.SD); Audio and Speech Processing (eess.AS)

Large Language Models (LLMs) have revolutionized natural language processing, but their application to speech-based tasks remains challenging due to the complexities of integrating audio and text modalities. This paper introduces Ichigo, a mixed-modal model that seamlessly processes interleaved sequences of speech and text. Utilizing a tokenized early-fusion approach, Ichigo quantizes speech into discrete tokens and employs a uniform transformer-based architecture for both speech and text modalities. This method enables joint reasoning and generation across modalities without the need for separate adapters. We present a comprehensive training methodology, including pre-training on multilingual speech recognition datasets and fine-tuning on a curated instruction dataset. Ichigo demonstrates state-of-the-art performance on speech question-answering benchmarks, outperforming existing open-source speech language models and achieving results comparable to cascaded systems. Notably, Ichigo exhibits a latency of just 111 ms to first token generation, significantly lower than current models. Our approach not only advances the field of multimodal AI but also provides a framework for smaller research teams to contribute effectively to open-source speech-language models.
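A minimal sketch of what tokenized early fusion can look like in practice: discrete speech tokens from a quantiser are mapped into an extended vocabulary after the text tokens and interleaved with text into one sequence for a single transformer. The vocabulary layout and boundary tokens below are assumptions for illustration, not Ichigo's actual configuration:

    TEXT_VOCAB_SIZE = 32000        # assumed size of the base text tokenizer
    SOA, EOA = 32000, 32001        # assumed start/end-of-audio marker ids
    SPEECH_OFFSET = 32002          # speech codebook ids are shifted past text + markers

    def fuse(turns):
        """turns: list of ("text", [token ids]) or ("speech", [codebook ids]).
        Returns one flat id sequence consumable by a single decoder-only model."""
        seq = []
        for modality, ids in turns:
            if modality == "text":
                seq.extend(ids)
            else:  # speech: wrap quantised audio tokens in boundary markers
                seq.append(SOA)
                seq.extend(SPEECH_OFFSET + i for i in ids)
                seq.append(EOA)
        return seq

    # Example: a spoken question (quantised to codes 12, 7, 513) followed by a text answer.
    print(fuse([("speech", [12, 7, 513]), ("text", [101, 2057, 2064])]))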
[9] arXiv:2412.16915 (replaced) [pdf, html, other]
Title: FADA: Fast Diffusion Avatar Synthesis with Mixed-Supervised Multi-CFG Distillation
Authors: Tianyun Zhong, Chao Liang, Jianwen Jiang, Gaojie Lin, Jiaqi Yang, Zhou Zhao
Comments: CVPR 2025, Homepage: https://fadavatar.github.io/
Subjects: Computer Vision and Pattern Recognition (cs.CV); Artificial Intelligence (cs.AI); Graphics (cs.GR); Sound (cs.SD); Audio and Speech Processing (eess.AS)

Diffusion-based audio-driven talking avatar methods have recently gained attention for their high-fidelity, vivid, and expressive results. However, their slow inference speed limits practical applications. Despite the development of various distillation techniques for diffusion models, we found that naive diffusion distillation methods do not yield satisfactory results. Distilled models exhibit reduced robustness with open-set input images and a decreased correlation between audio and video compared to teacher models, undermining the advantages of diffusion models. To address this, we propose FADA (Fast Diffusion Avatar Synthesis with Mixed-Supervised Multi-CFG Distillation). We first design a mixed-supervised loss to leverage data of varying quality and enhance the overall model capability as well as robustness. Additionally, we propose a multi-CFG distillation with learnable tokens to utilize the correlation between the audio and reference-image conditions, reducing the threefold inference runs caused by multi-CFG with acceptable quality degradation. Extensive experiments across multiple datasets show that FADA generates vivid videos comparable to recent diffusion model-based methods while achieving an NFE speedup of 4.17-12.5 times. Demos are available at our webpage: http://fadavatar.github.io.
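For background on why multi-CFG is expensive, the sketch below shows the common two-condition classifier-free-guidance composition, which requires three network evaluations per denoising step; this is the threefold cost the distilled student removes. The guidance scales and composition order follow the usual formulation and are assumptions here, not details taken from the paper:

    # Multi-CFG with two conditions (driving audio and a reference image):
    # one unconditional pass plus two conditional passes per denoising step.
    def multi_cfg_noise(model, x_t, t, audio, ref_img, s_audio=3.5, s_ref=2.0):
        eps_uncond = model(x_t, t, audio=None,  ref=None)      # 1st forward pass
        eps_ref    = model(x_t, t, audio=None,  ref=ref_img)   # 2nd forward pass
        eps_full   = model(x_t, t, audio=audio, ref=ref_img)   # 3rd forward pass
        # Compose guidance: push towards the reference-image condition, then
        # further towards the audio condition given the reference.
        return (eps_uncond
                + s_ref   * (eps_ref  - eps_uncond)
                + s_audio * (eps_full - eps_ref))

    # A distilled student collapses this into a single call, e.g.
    #   eps = student(x_t, t, audio=audio, ref=ref_img)   # one forward pass per step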