Search | arXiv e-print repository
<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 123 results for author: <span class="mathjax">Huang, R</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> <div class="content"> <form method="GET" action="/search/eess" aria-role="search"> Searching in archive <strong>eess</strong>. <a href="/search/?searchtype=author&query=Huang%2C+R">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Huang, R"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Huang%2C+R&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Huang, R"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Huang%2C+R&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Huang%2C+R&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Huang%2C+R&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Huang%2C+R&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.01805">arXiv:2411.01805</a> <span> [<a href="https://arxiv.org/pdf/2411.01805">pdf</a>, <a href="https://arxiv.org/format/2411.01805">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> MoMu-Diffusion: On Learning Long-Term Motion-Music Synchronization and Correspondence </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=You%2C+F">Fuming You</a>, <a href="/search/eess?searchtype=author&query=Fang%2C+M">Minghui Fang</a>, <a href="/search/eess?searchtype=author&query=Tang%2C+L">Li Tang</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+R">Rongjie Huang</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+Y">Yongqi Wang</a>, <a href="/search/eess?searchtype=author&query=Zhao%2C+Z">Zhou Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.01805v1-abstract-short" style="display: inline;"> Motion-to-music and music-to-motion have been studied separately, each attracting substantial research interest within their respective domains. The interaction between human motion and music is a reflection of advanced human intelligence, and establishing a unified relationship between them is particularly important. 
However, to date, there has been no work that considers them jointly to explore the modality alignment within. To bridge this gap, we propose a novel framework, termed MoMu-Diffusion, for long-term and synchronous motion-music generation. Firstly, to mitigate the huge computational costs raised by long sequences, we propose a novel Bidirectional Contrastive Rhythmic Variational Auto-Encoder (BiCoR-VAE) that extracts the modality-aligned latent representations for both motion and music inputs. Subsequently, leveraging the aligned latent spaces, we introduce a multi-modal Transformer-based diffusion model and a cross-guidance sampling strategy to enable various generation tasks, including cross-modal, multi-modal, and variable-length generation. Extensive experiments demonstrate that MoMu-Diffusion surpasses recent state-of-the-art methods both qualitatively and quantitatively, and can synthesize realistic, diverse, long-term, and beat-matched music or motion sequences. The generated samples and codes are available at https://momu-diffusion.github.io/
   Submitted 4 November, 2024; originally announced November 2024.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">NeurIPS 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.21269">arXiv:2410.21269</a> <span> [<a href="https://arxiv.org/pdf/2410.21269">pdf</a>, <a href="https://arxiv.org/format/2410.21269">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> OmniSep: Unified Omni-Modality Sound Separation with Query-Mixup </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Cheng%2C+X">Xize Cheng</a>, <a href="/search/eess?searchtype=author&query=Zheng%2C+S">Siqi Zheng</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+Z">Zehan Wang</a>, <a href="/search/eess?searchtype=author&query=Fang%2C+M">Minghui Fang</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+Z">Ziang Zhang</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+R">Rongjie Huang</a>, <a href="/search/eess?searchtype=author&query=Ma%2C+Z">Ziyang Ma</a>, <a href="/search/eess?searchtype=author&query=Ji%2C+S">Shengpeng Ji</a>, <a href="/search/eess?searchtype=author&query=Zuo%2C+J">Jialong Zuo</a>, <a href="/search/eess?searchtype=author&query=Jin%2C+T">Tao Jin</a>, <a href="/search/eess?searchtype=author&query=Zhao%2C+Z">Zhou Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.21269v1-abstract-short" style="display: inline;"> The scaling up has brought tremendous success in the fields of vision and language in recent years. When it comes to audio, however, researchers encounter a major challenge in scaling up the training data, as most natural audio contains diverse interfering signals. To address this limitation, we introduce Omni-modal Sound Separation (OmniSep), a novel framework capable of isolating clean soundtrac… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21269v1-abstract-full').style.display = 'inline'; document.getElementById('2410.21269v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.21269v1-abstract-full" style="display: none;"> The scaling up has brought tremendous success in the fields of vision and language in recent years. When it comes to audio, however, researchers encounter a major challenge in scaling up the training data, as most natural audio contains diverse interfering signals. To address this limitation, we introduce Omni-modal Sound Separation (OmniSep), a novel framework capable of isolating clean soundtracks based on omni-modal queries, encompassing both single-modal and multi-modal composed queries. Specifically, we introduce the Query-Mixup strategy, which blends query features from different modalities during training. 
This enables OmniSep to optimize multiple modalities concurrently, effectively bringing all modalities under a unified framework for sound separation. We further enhance this flexibility by allowing queries to influence sound separation positively or negatively, facilitating the retention or removal of specific sounds as desired. Finally, OmniSep employs a retrieval-augmented approach known as Query-Aug, which enables open-vocabulary sound separation. Experimental evaluations on MUSIC, VGGSOUND-CLEAN+, and MUSIC-CLEAN+ datasets demonstrate the effectiveness of OmniSep, achieving state-of-the-art performance in text-, image-, and audio-queried sound separation tasks. For samples and further information, please visit the demo page at https://omnisep.github.io/.
   Submitted 28 October, 2024; originally announced October 2024.
   Comments: Working in progress

3. arXiv:2410.12266 [pdf, other]  eess.AS; cs.SD
   FlashAudio: Rectified Flows for Fast and High-Fidelity Text-to-Audio Generation
   Authors: Huadai Liu, Jialei Wang, Rongjie Huang, Yang Liu, Heng Lu, Wei Xue, Zhou Zhao
   Abstract: Recent advancements in latent diffusion models (LDMs) have markedly enhanced text-to-audio generation, yet their iterative sampling processes impose substantial computational demands, limiting practical deployment.
While recent methods utilizing consistency-based distillation aim to achieve few-step or single-step inference, their one-step performance is constrained by curved trajectories, preventing them from surpassing traditional diffusion models. In this work, we introduce FlashAudio with rectified flows to learn a straight flow for fast simulation. To alleviate the inefficient timestep allocation and suboptimal distribution of noise, FlashAudio optimizes the time distribution of rectified flow with Bifocal Samplers and proposes immiscible flow to minimize the total distance of data-noise pairs in a batch via assignment. Furthermore, to address the amplified accumulation error caused by classifier-free guidance (CFG), we propose Anchored Optimization, which refines the guidance scale by anchoring it to a reference trajectory. Experimental results on text-to-audio generation demonstrate that FlashAudio's one-step generation performance surpasses diffusion-based models with hundreds of sampling steps on audio quality and enables a sampling speed 400x faster than real-time on a single NVIDIA 4090Ti GPU.
   Submitted 16 October, 2024; originally announced October 2024.
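   The FlashAudio entry above rests on the core property of rectified flows: if the learned flow is nearly straight, a single Euler step from noise to data is already accurate. The sketch below illustrates only that generic sampling loop; it is not FlashAudio's code, and the velocity_model, conditioning argument, and latent shape are hypothetical placeholders.

```python
# Minimal rectified-flow (straight-flow) sampling sketch, for illustration only.
# `velocity_model` stands in for a trained network v_theta(x_t, t, cond) predicting the
# velocity of the probability-flow ODE; with a perfectly straightened flow, a single
# Euler step (num_steps=1) already maps noise to a data latent.
import torch

def sample_rectified_flow(velocity_model, cond, shape, num_steps=1):
    x = torch.randn(shape)                      # x_0 ~ N(0, I), the noise end of the path
    ts = torch.linspace(0.0, 1.0, num_steps + 1)
    for i in range(num_steps):
        t, t_next = ts[i], ts[i + 1]
        v = velocity_model(x, t, cond)          # predicted velocity at (x_t, t)
        x = x + (t_next - t) * v                # Euler update along the (ideally straight) path
    return x                                    # latent to be decoded into a waveform

# Toy usage with a dummy velocity field; a real system would use a trained text-conditioned model.
dummy_velocity = lambda x, t, cond: -x
latent = sample_rectified_flow(dummy_velocity, cond=None, shape=(1, 8, 256), num_steps=1)
print(latent.shape)  # torch.Size([1, 8, 256])
```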
4. arXiv:2409.15977 [pdf, other]  eess.AS; cs.CL; cs.SD
   TCSinger: Zero-Shot Singing Voice Synthesis with Style Transfer and Multi-Level Style Control
   Authors: Yu Zhang, Ziyue Jiang, Ruiqi Li, Changhao Pan, Jinzheng He, Rongjie Huang, Chuxin Wang, Zhou Zhao
   Abstract: Zero-shot singing voice synthesis (SVS) with style transfer and style control aims to generate high-quality singing voices with unseen timbres and styles (including singing method, emotion, rhythm, technique, and pronunciation) from audio and text prompts. However, the multifaceted nature of singing styles poses a significant challenge for effective modeling, transfer, and control. Furthermore, current SVS models often fail to generate singing voices rich in stylistic nuances for unseen singers. To address these challenges, we introduce TCSinger, the first zero-shot SVS model for style transfer across cross-lingual speech and singing styles, along with multi-level style control.
Specifically, TCSinger proposes three primary modules: 1) the clustering style encoder employs a clustering vector quantization model to stably condense style information into a compact latent space; 2) the Style and Duration Language Model (S&D-LM) concurrently predicts style information and phoneme duration, which benefits both; 3) the style adaptive decoder uses a novel mel-style adaptive normalization method to generate singing voices with enhanced details. Experimental results show that TCSinger outperforms all baseline models in synthesis quality, singer similarity, and style controllability across various tasks, including zero-shot style transfer, multi-level style control, cross-lingual style transfer, and speech-to-singing style transfer. Singing voice samples can be accessed at https://tcsinger.github.io/.
   Submitted 3 October, 2024; v1 submitted 24 September, 2024; originally announced September 2024.
   Comments: Accepted by EMNLP 2024

5. arXiv:2408.16532 [pdf, other]  eess.AS; cs.LG; cs.MM; cs.SD; eess.SP
   WavTokenizer: an Efficient Acoustic Discrete Codec Tokenizer for Audio Language Modeling
   Authors: Shengpeng Ji, Ziyue Jiang, Wen Wang, Yifu Chen, Minghui Fang, Jialong Zuo, Qian Yang, Xize Cheng, Zehan Wang, Ruiqi Li, Ziang Zhang, Xiaoda Yang, Rongjie Huang,
Yidi Jiang, Qian Chen, Siqi Zheng, Wen Wang, Zhou Zhao
   Abstract: Language models have been effectively applied to modeling natural signals, such as images, video, speech, and audio. A crucial component of these models is the codec tokenizer, which compresses high-dimensional natural signals into lower-dimensional discrete tokens. In this paper, we introduce WavTokenizer, which offers several advantages over previous SOTA acoustic codec models in the audio domain: 1) extreme compression. By compressing the layers of quantizers and the temporal dimension of the discrete codec, one second of audio at a 24 kHz sampling rate requires only a single quantizer with 40 or 75 tokens. 2) improved subjective quality. Despite the reduced number of tokens, WavTokenizer achieves state-of-the-art reconstruction quality with outstanding UTMOS scores and inherently contains richer semantic information. Specifically, we achieve these results by designing a broader VQ space, extended contextual windows, and improved attention networks, as well as introducing a powerful multi-scale discriminator and an inverse Fourier transform structure. We conducted extensive reconstruction experiments in the domains of speech, audio, and music. WavTokenizer exhibited strong performance across various objective and subjective metrics compared to state-of-the-art models. We also tested semantic information, VQ utilization, and adaptability to generative models. Comprehensive ablation studies confirm the necessity of each module in WavTokenizer. The related code, demos, and pre-trained models are available at https://github.com/jishengpeng/WavTokenizer.
   Submitted 22 October, 2024; v1 submitted 29 August, 2024; originally announced August 2024.
   Comments: Working in progress
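   As a quick sanity check on the compression figures quoted in the WavTokenizer abstract (one second of 24 kHz audio represented by a single quantizer with 40 or 75 tokens), the arithmetic below works out the implied hop size and token bitrate. The 1024-entry codebook is an illustrative assumption, not a number taken from the entry.

```python
import math

# Back-of-the-envelope compression arithmetic for a single-quantizer codec like the one
# described above; the codebook size is an assumption, the rates come from the abstract.
sample_rate = 24_000            # input samples per second
tokens_per_second = 75          # the abstract also mentions a 40-token configuration
codebook_size = 1024            # hypothetical codebook, i.e. 10 bits per token

samples_per_token = sample_rate / tokens_per_second              # 320 samples covered by each token
bits_per_token = math.log2(codebook_size)                        # 10.0 bits
token_bitrate_kbps = tokens_per_second * bits_per_token / 1000   # 0.75 kbps token stream
raw_pcm_kbps = sample_rate * 16 / 1000                           # 384 kbps for 16-bit PCM audio

print(samples_per_token, token_bitrate_kbps, raw_pcm_kbps)       # 320.0 0.75 384.0
```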
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Working in progress</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.13893">arXiv:2408.13893</a> <span> [<a href="https://arxiv.org/pdf/2408.13893">pdf</a>, <a href="https://arxiv.org/format/2408.13893">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> SimpleSpeech 2: Towards Simple and Efficient Text-to-Speech with Flow-based Scalar Latent Transformer Diffusion Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Yang%2C+D">Dongchao Yang</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+R">Rongjie Huang</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+Y">Yuanyuan Wang</a>, <a href="/search/eess?searchtype=author&query=Guo%2C+H">Haohan Guo</a>, <a href="/search/eess?searchtype=author&query=Chong%2C+D">Dading Chong</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+S">Songxiang Liu</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+X">Xixin Wu</a>, <a href="/search/eess?searchtype=author&query=Meng%2C+H">Helen Meng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.13893v2-abstract-short" style="display: inline;"> Scaling Text-to-speech (TTS) to large-scale datasets has been demonstrated as an effective method for improving the diversity and naturalness of synthesized speech. At the high level, previous large-scale TTS models can be categorized into either Auto-regressive (AR) based (\textit{e.g.}, VALL-E) or Non-auto-regressive (NAR) based models (\textit{e.g.}, NaturalSpeech 2/3). Although these works dem… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.13893v2-abstract-full').style.display = 'inline'; document.getElementById('2408.13893v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.13893v2-abstract-full" style="display: none;"> Scaling Text-to-speech (TTS) to large-scale datasets has been demonstrated as an effective method for improving the diversity and naturalness of synthesized speech. At the high level, previous large-scale TTS models can be categorized into either Auto-regressive (AR) based (\textit{e.g.}, VALL-E) or Non-auto-regressive (NAR) based models (\textit{e.g.}, NaturalSpeech 2/3). Although these works demonstrate good performance, they still have potential weaknesses. For instance, AR-based models are plagued by unstable generation quality and slow generation speed; meanwhile, some NAR-based models need phoneme-level duration alignment information, thereby increasing the complexity of data pre-processing, model design, and loss design. In this work, we build upon our previous publication by implementing a simple and efficient non-autoregressive (NAR) TTS framework, termed SimpleSpeech 2. 
SimpleSpeech 2 effectively combines the strengths of both autoregressive (AR) and non-autoregressive (NAR) methods, offering the following key advantages: (1) simplified data preparation; (2) straightforward model and loss design; and (3) stable, high-quality generation performance with fast inference speed. Compared to our previous publication, we present (i) a detailed analysis of the influence of the speech tokenizer and noisy labels on TTS performance; (ii) four distinct types of sentence duration predictors; and (iii) a novel flow-based scalar latent transformer diffusion model. With these improvements, we show a significant improvement in generation performance and generation speed compared to our previous work and other state-of-the-art (SOTA) large-scale TTS models. Furthermore, we show that SimpleSpeech 2 can be seamlessly extended to multilingual TTS by training it on multilingual speech datasets. Demos are available at https://dongchaoyang.top/SimpleSpeech2_demo/.
   Submitted 28 August, 2024; v1 submitted 25 August, 2024; originally announced August 2024.
   Comments: Submitted to TASLP

7. arXiv:2408.12102 [pdf, other]  cs.LG; cs.CV; cs.SD; eess.AS
   Integrating Audio, Visual, and Semantic Information for Enhanced Multimodal Speaker Diarization
   Authors: Luyao Cheng, Hui Wang, Siqi Zheng, Yafeng Chen, Rongjie Huang, Qinglin Zhang, Qian Chen, Xihao Li
   Abstract: Speaker diarization, the process of segmenting an audio stream or transcribed speech content into homogenous partitions based on speaker identity, plays a crucial role in the interpretation and analysis of human speech. Most existing speaker diarization systems rely exclusively on unimodal acoustic information, making the task particularly challenging due to the innate ambiguities of audio signals. Recent studies have made tremendous efforts towards audio-visual or audio-semantic modeling to enhance performance. However, even the incorporation of up to two modalities often falls short in addressing the complexities of spontaneous and unstructured conversations. To exploit more meaningful dialogue patterns, we propose a novel multimodal approach that jointly utilizes audio, visual, and semantic cues to enhance speaker diarization. Our method elegantly formulates the multimodal modeling as a constrained optimization problem. First, we build insights into the visual connections among active speakers and the semantic interactions within spoken content, thereby establishing abundant pairwise constraints. Then we introduce a joint pairwise constraint propagation algorithm to cluster speakers based on these visual and semantic constraints. This integration effectively leverages the complementary strengths of different modalities, refining the affinity estimation between individual speaker embeddings. Extensive experiments conducted on multiple multimodal datasets demonstrate that our approach consistently outperforms state-of-the-art speaker diarization methods.
   Submitted 21 August, 2024; originally announced August 2024.
8. arXiv:2407.13220 [pdf, other]  eess.AS; cs.SD
   MEDIC: Zero-shot Music Editing with Disentangled Inversion Control
   Authors: Huadai Liu, Jialei Wang, Xiangtai Li, Rongjie Huang, Yang Liu, Jiayang Xu, Zhou Zhao
   Abstract: Text-guided diffusion models make a paradigm shift in audio generation, facilitating the adaptability of source audio to conform to specific textual prompts. Recent works introduce inversion techniques, like DDIM inversion, to zero-shot editing, exploiting pretrained diffusion models for audio modification. Nonetheless, our investigation exposes that DDIM inversion suffers from an accumulation of errors across each diffusion step, undermining its efficacy. Moreover, existing editing methods fail to achieve effective complex non-rigid music editing while maintaining essential content preservation and high editing fidelity. To counteract these issues, we introduce the Disentangled Inversion technique to disentangle the diffusion process into triple branches, rectifying the deviated path of the source branch caused by DDIM inversion. In addition, we propose the Harmonized Attention Control framework, which unifies mutual self-attention control and cross-attention control with an intermediate Harmonic Branch to progressively achieve the desired harmonic and melodic information in the target music. Collectively, these innovations comprise the Disentangled Inversion Control (DIC) framework, enabling accurate music editing while safeguarding content integrity.
To benchmark audio editing efficacy, we introduce ZoME-Bench, a comprehensive music editing benchmark hosting 1,100 samples spread across ten distinct editing categories. This facilitates both zero-shot and instruction-based music editing tasks. Our method achieves unparalleled performance in edit fidelity and essential content preservation, outperforming contemporary state-of-the-art inversion techniques.
   Submitted 15 October, 2024; v1 submitted 18 July, 2024; originally announced July 2024.

9. arXiv:2407.10303 [pdf, other]  eess.AS; cs.CL
   Improving Neural Biasing for Contextual Speech Recognition by Early Context Injection and Text Perturbation
   Authors: Ruizhe Huang, Mahsa Yarmohammadi, Sanjeev Khudanpur, Daniel Povey
   Abstract: Existing research suggests that automatic speech recognition (ASR) models can benefit from additional contexts (e.g., contact lists, user specified vocabulary). Rare words and named entities can be better recognized with contexts. In this work, we propose two simple yet effective techniques to improve context-aware ASR models. First, we inject contexts into the encoders at an early stage instead of merely at their last layers.
Second, to force the model to leverage the contexts during training, we perturb the reference transcription with alternative spellings so that the model learns to rely on the contexts to make correct predictions. On LibriSpeech, our techniques together reduce the rare word error rate by 60% and 25% relative to no biasing and shallow fusion, respectively, achieving new state-of-the-art performance. On SPGISpeech and a real-world dataset ConEC, our techniques also yield good improvements over the baselines.
   Submitted 14 July, 2024; originally announced July 2024.
   Comments: Accepted to INTERSPEECH 2024
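   The abstract above describes perturbing reference transcriptions with alternative spellings so that the model must consult the supplied biasing context rather than guess from acoustics alone. The toy sketch below shows one plausible reading of that idea; the variant table, function name, and the choice to add the perturbed form to the biasing list are assumptions for illustration, not details from the paper.

```python
import random

# Hypothetical illustration of the text-perturbation idea: swap some words in the
# reference transcript for alternative spellings, keeping the swapped-in form in the
# biasing list so the model can only get it right by attending to the context.
ALT_SPELLINGS = {                        # toy variant table, not the paper's
    "kathryn": ["catherine", "katherine"],
    "smyth": ["smith", "smythe"],
}

def perturb(reference_words, biasing_list, prob=0.5, rng=random):
    """Return (perturbed reference, updated biasing list)."""
    new_ref, new_bias = [], set(biasing_list)
    for w in reference_words:
        variants = ALT_SPELLINGS.get(w.lower())
        if variants and rng.random() < prob:
            alt = rng.choice(variants)
            new_ref.append(alt)
            new_bias.add(alt)            # assumption: the target form stays findable in context
        else:
            new_ref.append(w)
    return new_ref, sorted(new_bias)

print(perturb(["call", "kathryn", "now"], ["kathryn"]))
```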
10. arXiv:2407.02049 [pdf, other]  eess.AS; cs.CL; cs.SD
   Accompanied Singing Voice Synthesis with Fully Text-controlled Melody
   Authors: Ruiqi Li, Zhiqing Hong, Yongqi Wang, Lichao Zhang, Rongjie Huang, Siqi Zheng, Zhou Zhao
   Abstract: Text-to-song (TTSong) is a music generation task that synthesizes accompanied singing voices. Current TTSong methods, inherited from singing voice synthesis (SVS), require melody-related information that can sometimes be impractical, such as music scores or MIDI sequences. We present MelodyLM, the first TTSong model that generates high-quality song pieces with fully text-controlled melodies, achieving minimal user requirements and maximum control flexibility. MelodyLM explicitly models MIDI as the intermediate melody-related feature and sequentially generates vocal tracks in a language model manner, conditioned on textual and vocal prompts. The accompaniment music is subsequently synthesized by a latent diffusion model with hybrid conditioning for temporal alignment. With minimal requirements, users only need to input lyrics and a reference voice to synthesize a song sample. For full control, just input textual prompts or even directly input MIDI. Experimental results indicate that MelodyLM achieves superior performance in terms of both objective and subjective metrics. Audio samples are available at https://melodylm666.github.io.
   Submitted 2 July, 2024; originally announced July 2024.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Working in progress</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.10056">arXiv:2406.10056</a> <span> [<a href="https://arxiv.org/pdf/2406.10056">pdf</a>, <a href="https://arxiv.org/format/2406.10056">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> UniAudio 1.5: Large Language Model-driven Audio Codec is A Few-shot Audio Task Learner </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Yang%2C+D">Dongchao Yang</a>, <a href="/search/eess?searchtype=author&query=Guo%2C+H">Haohan Guo</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+Y">Yuanyuan Wang</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+R">Rongjie Huang</a>, <a href="/search/eess?searchtype=author&query=Li%2C+X">Xiang Li</a>, <a href="/search/eess?searchtype=author&query=Tan%2C+X">Xu Tan</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+X">Xixin Wu</a>, <a href="/search/eess?searchtype=author&query=Meng%2C+H">Helen Meng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.10056v1-abstract-short" style="display: inline;"> The Large Language models (LLMs) have demonstrated supreme capabilities in text understanding and generation, but cannot be directly applied to cross-modal tasks without fine-tuning. This paper proposes a cross-modal in-context learning approach, empowering the frozen LLMs to achieve multiple audio tasks in a few-shot style without any parameter update. Specifically, we propose a novel and LLMs-dr… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.10056v1-abstract-full').style.display = 'inline'; document.getElementById('2406.10056v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.10056v1-abstract-full" style="display: none;"> The Large Language models (LLMs) have demonstrated supreme capabilities in text understanding and generation, but cannot be directly applied to cross-modal tasks without fine-tuning. This paper proposes a cross-modal in-context learning approach, empowering the frozen LLMs to achieve multiple audio tasks in a few-shot style without any parameter update. Specifically, we propose a novel and LLMs-driven audio codec model, LLM-Codec, to transfer the audio modality into the textual space, \textit{i.e.} representing audio tokens with words or sub-words in the vocabulary of LLMs, while keeping high audio reconstruction quality. The key idea is to reduce the modality heterogeneity between text and audio by compressing the audio modality into a well-trained LLMs token space. Thus, the audio representation can be viewed as a new \textit{foreign language}, and LLMs can learn the new \textit{foreign language} with several demonstrations. 
In experiments, we investigate the performance of the proposed approach across multiple audio understanding and generation tasks, e.g., speech emotion classification, audio classification, text-to-speech generation, and speech enhancement. The experimental results demonstrate that LLMs equipped with the proposed LLM-Codec, named UniAudio 1.5, prompted by only a few examples, can achieve the expected functions in simple scenarios, validating the feasibility and effectiveness of the proposed cross-modal in-context learning approach. To facilitate research on few-shot audio task learning and multi-modal LLMs, we have open-sourced the LLM-Codec model.
Submitted 14 June, 2024; originally announced June 2024.
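The recipe above amounts to turning an audio task into ordinary few-shot prompting once audio is represented with tokens drawn from the LLM's own vocabulary. Below is a hedged sketch of that prompting pattern; the codec encoder and the "frozen LLM" are stubs with hypothetical names, not the released LLM-Codec API.

```python
# Sketch of cross-modal in-context learning: audio is mapped to tokens that live
# in a frozen LLM's vocabulary, so an audio task becomes few-shot text prompting.
from typing import List, Tuple

def llm_codec_encode(waveform: List[float]) -> List[str]:
    """Stub: a real codec would quantize audio into tokens from the LLM vocabulary."""
    return ["<a%d>" % (int(abs(x) * 4) % 4) for x in waveform[:8]]

def frozen_llm_continue(prompt: str) -> str:
    """Stub for a frozen LLM; here it just echoes the most frequent label in the prompt."""
    labels = [w for w in prompt.split() if w in ("happy", "sad", "angry", "neutral")]
    return max(set(labels), key=labels.count) if labels else "neutral"

def few_shot_emotion_prompt(examples: List[Tuple[List[float], str]], query: List[float]) -> str:
    lines = []
    for wav, label in examples:  # demonstrations: (audio, label) pairs
        lines.append("audio: %s -> emotion: %s" % (" ".join(llm_codec_encode(wav)), label))
    lines.append("audio: %s -> emotion:" % " ".join(llm_codec_encode(query)))
    return "\n".join(lines)

demos = [([0.1, 0.9, 0.2, 0.8], "happy"), ([0.0, 0.1, 0.0, 0.1], "sad")]
prompt = few_shot_emotion_prompt(demos, [0.1, 0.8, 0.3, 0.7])
print(prompt)
print("prediction:", frozen_llm_continue(prompt))
```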
arXiv:2406.04858 (https://arxiv.org/abs/2406.04858) [pdf, other]
Subjects: cs.RO (Robotics); eess.SY (Systems and Control)
Title: Auto-Multilift: Distributed Learning and Control for Cooperative Load Transportation With Quadrotors
Authors: Bingheng Wang, Rui Huang, Lin Zhao
Abstract: Designing motion control and planning algorithms for multilift systems remains challenging due to the complexities of dynamics, collision avoidance, actuator limits, and scalability. Existing methods that use optimization and distributed techniques effectively address these constraints and scalability issues; however, they often require substantial manual tuning, leading to suboptimal performance. This paper proposes Auto-Multilift, a novel framework that automates the tuning of model predictive controllers (MPCs) for multilift systems. We model the MPC cost functions with deep neural networks (DNNs), enabling fast online adaptation to various scenarios. We develop a distributed policy gradient algorithm to train these DNNs efficiently in a closed-loop manner. Central to our algorithm is distributed sensitivity propagation, which fully exploits the unique dynamic couplings within the multilift system. It parallelizes gradient computation across quadrotors and focuses on actual system state sensitivities relative to key MPC parameters. Extensive simulations demonstrate favorable scalability to a large number of quadrotors. Our method outperforms a state-of-the-art open-loop MPC tuning approach by effectively learning adaptive MPCs from trajectory tracking errors. It also excels in learning an adaptive reference for reconfiguring the system when traversing multiple narrow slots.
Submitted 7 October, 2024; v1 submitted 7 June, 2024; originally announced June 2024.
class="abstract-short has-text-grey-dark mathjax" id="2406.02560v3-abstract-short" style="display: inline;"> Connectionist temporal classification (CTC) models are known to have peaky output distributions. Such behavior is not a problem for automatic speech recognition (ASR), but it can cause inaccurate forced alignments (FA), especially at finer granularity, e.g., phoneme level. This paper aims at alleviating the peaky behavior for CTC and improve its suitability for forced alignment generation, by leve… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.02560v3-abstract-full').style.display = 'inline'; document.getElementById('2406.02560v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.02560v3-abstract-full" style="display: none;"> Connectionist temporal classification (CTC) models are known to have peaky output distributions. Such behavior is not a problem for automatic speech recognition (ASR), but it can cause inaccurate forced alignments (FA), especially at finer granularity, e.g., phoneme level. This paper aims at alleviating the peaky behavior for CTC and improve its suitability for forced alignment generation, by leveraging label priors, so that the scores of alignment paths containing fewer blanks are boosted and maximized during training. As a result, our CTC model produces less peaky posteriors and is able to more accurately predict the offset of the tokens besides their onset. It outperforms the standard CTC model and a heuristics-based approach for obtaining CTC's token offset timestamps by 12-40% in phoneme and word boundary errors (PBE and WBE) measured on the Buckeye and TIMIT data. Compared with the most widely used FA toolkit Montreal Forced Aligner (MFA), our method performs similarly on PBE/WBE on Buckeye, yet falls behind MFA on TIMIT. Nevertheless, our method has a much simpler training pipeline and better runtime efficiency. Our training recipe and pretrained model are released in TorchAudio. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.02560v3-abstract-full').style.display = 'none'; document.getElementById('2406.02560v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 22 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ICASSP 2024. 
arXiv:2406.02429 (https://arxiv.org/abs/2406.02429) [pdf, other]
Subjects: eess.AS (Audio and Speech Processing); cs.SD (Sound)
Title: Self-Supervised Singing Voice Pre-Training towards Speech-to-Singing Conversion
Authors: Ruiqi Li, Rongjie Huang, Yongqi Wang, Zhiqing Hong, Zhou Zhao
Abstract: The speech-to-singing voice conversion (STS) task suffers from data scarcity, because it requires paired speech and singing data. Compounding this issue are the challenges of content-pitch alignment and the suboptimal quality of generated outputs, presenting significant hurdles in STS research. This paper presents SVPT, an STS approach boosted by a self-supervised singing voice pre-training model. We leverage spoken language model techniques to tackle the rhythm alignment problem and use the in-context learning capability to achieve zero-shot conversion. We adopt discrete-unit random resampling and pitch corruption strategies, enabling training with unpaired singing data and thus mitigating the issue of data scarcity. SVPT also serves as an effective backbone for singing voice synthesis (SVS), offering insights into scaling up SVS models. Experimental results indicate that SVPT delivers notable improvements in both STS and SVS endeavors. Audio samples are available at https://speech2sing.github.io.
Submitted 4 June, 2024; originally announced June 2024.
Comments: 13 pages.
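The two corruption strategies named in the abstract (discrete-unit random resampling and pitch corruption) are simple to picture. The sketch below assumes discrete content units and a frame-level F0 track as inputs; the sampling ranges and distributions are illustrative assumptions, not SVPT's exact settings.

```python
# Toy data-corruption sketch: random resampling of discrete units and F0 corruption.
import random

def random_resample_units(units, low=0.5, high=2.0):
    """Randomly stretch/compress a discrete-unit sequence by repeating or dropping units."""
    factor = random.uniform(low, high)
    out = []
    for i, u in enumerate(units):
        copies = int((i + 1) * factor) - int(i * factor)  # 0, 1, or 2 copies per unit
        out.extend([u] * copies)
    return out

def corrupt_pitch(f0, max_shift_semitones=4.0, jitter=0.02):
    """Shift the whole F0 contour by a random interval and add small multiplicative jitter."""
    shift = 2 ** (random.uniform(-max_shift_semitones, max_shift_semitones) / 12.0)
    return [x * shift * (1.0 + random.uniform(-jitter, jitter)) if x > 0 else 0.0 for x in f0]

units = [12, 12, 7, 7, 7, 30, 30, 5]
f0 = [220.0, 220.5, 221.0, 0.0, 233.1, 233.0]
print(random_resample_units(units))
print([round(x, 1) for x in corrupt_pitch(f0)])
```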
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.02429v1-abstract-full').style.display = 'none'; document.getElementById('2406.02429v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">13 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.00356">arXiv:2406.00356</a> <span> [<a href="https://arxiv.org/pdf/2406.00356">pdf</a>, <a href="https://arxiv.org/format/2406.00356">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> AudioLCM: Text-to-Audio Generation with Latent Consistency Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Liu%2C+H">Huadai Liu</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+R">Rongjie Huang</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+Y">Yang Liu</a>, <a href="/search/eess?searchtype=author&query=Cao%2C+H">Hengyuan Cao</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+J">Jialei Wang</a>, <a href="/search/eess?searchtype=author&query=Cheng%2C+X">Xize Cheng</a>, <a href="/search/eess?searchtype=author&query=Zheng%2C+S">Siqi Zheng</a>, <a href="/search/eess?searchtype=author&query=Zhao%2C+Z">Zhou Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.00356v2-abstract-short" style="display: inline;"> Recent advancements in Latent Diffusion Models (LDMs) have propelled them to the forefront of various generative tasks. However, their iterative sampling process poses a significant computational burden, resulting in slow generation speeds and limiting their application in text-to-audio generation deployment. In this work, we introduce AudioLCM, a novel consistency-based model tailored for efficie… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.00356v2-abstract-full').style.display = 'inline'; document.getElementById('2406.00356v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.00356v2-abstract-full" style="display: none;"> Recent advancements in Latent Diffusion Models (LDMs) have propelled them to the forefront of various generative tasks. However, their iterative sampling process poses a significant computational burden, resulting in slow generation speeds and limiting their application in text-to-audio generation deployment. In this work, we introduce AudioLCM, a novel consistency-based model tailored for efficient and high-quality text-to-audio generation. AudioLCM integrates Consistency Models into the generation process, facilitating rapid inference through a mapping from any point at any time step to the trajectory's initial point. 
To overcome the convergence issue inherent in LDMs with reduced sample iterations, we propose Guided Latent Consistency Distillation with a multi-step Ordinary Differential Equation (ODE) solver. This innovation shortens the time schedule from thousands to dozens of steps while maintaining sample quality, thereby achieving fast convergence and high-quality generation. Furthermore, to optimize the performance of transformer-based neural network architectures, we integrate the advanced techniques pioneered by LLaMA into the foundational framework of transformers. This architecture supports stable and efficient training, ensuring robust performance in text-to-audio synthesis. Experimental results on text-to-sound generation and text-to-music synthesis tasks demonstrate that AudioLCM needs only 2 iterations to synthesize high-fidelity audio, while maintaining sample quality competitive with state-of-the-art models using hundreds of steps. AudioLCM achieves a sampling speed 333x faster than real-time on a single NVIDIA 4090Ti GPU, making generative models practically applicable to text-to-audio generation deployment. Our extensive preliminary analysis shows that each design in AudioLCM is effective.
Submitted 9 July, 2024; v1 submitted 1 June, 2024; originally announced June 2024.
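Why two iterations can be enough becomes clearer with a generic multistep consistency sampling loop: each call maps a noisy latent straight back to an estimate of the clean latent, which is then lightly re-noised and refined. The sketch below is a generic numpy illustration of that loop; the consistency network is a stub, and the noise levels and shapes are assumptions, not AudioLCM's configuration.

```python
# Generic few-step consistency-model sampling loop (not AudioLCM's exact sampler).
import numpy as np

def consistency_fn(x_noisy, sigma, cond):
    """Stub for a distilled consistency network f(x_t, t, text) -> x_0 estimate."""
    return x_noisy / (1.0 + sigma)   # placeholder shrinkage toward the origin

def sample(cond, shape=(8,), sigmas=(80.0, 2.0), seed=0):
    rng = np.random.default_rng(seed)
    x = rng.standard_normal(shape) * sigmas[0]       # start from pure noise
    x0 = consistency_fn(x, sigmas[0], cond)          # one-step estimate
    for sigma in sigmas[1:]:                         # optional refinement steps
        x = x0 + sigma * rng.standard_normal(shape)  # re-noise to a smaller level
        x0 = consistency_fn(x, sigma, cond)
    return x0                                        # decoded to audio by a VAE/vocoder in practice

print(sample(cond="dog barking"))
```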
arXiv:2406.00320 (https://arxiv.org/abs/2406.00320) [pdf, other]
Subjects: cs.SD (Sound); cs.CV (Computer Vision and Pattern Recognition); cs.MM (Multimedia); eess.AS (Audio and Speech Processing)
Title: Frieren: Efficient Video-to-Audio Generation Network with Rectified Flow Matching
Authors: Yongqi Wang, Wenxiang Guo, Rongjie Huang, Jiawei Huang, Zehan Wang, Fuming You, Ruiqi Li, Zhou Zhao
Abstract: Video-to-audio (V2A) generation aims to synthesize content-matching audio from silent video, and it remains challenging to build V2A models with high generation quality, efficiency, and visual-audio temporal synchrony. We propose Frieren, a V2A model based on rectified flow matching. Frieren regresses the conditional transport vector field from noise to spectrogram latent with straight paths and conducts sampling by solving an ODE, outperforming autoregressive and score-based models in terms of audio quality. By employing a non-autoregressive vector field estimator based on a feed-forward transformer and channel-level cross-modal feature fusion with strong temporal alignment, our model generates audio that is highly synchronized with the input video. Furthermore, through reflow and one-step distillation with a guided vector field, our model can generate decent audio in a few, or even a single, sampling step. Experiments indicate that Frieren achieves state-of-the-art performance in both generation quality and temporal alignment on VGGSound, with alignment accuracy reaching 97.22% and a 6.2% improvement in inception score over the strong diffusion-based baseline. Audio samples are available at http://frieren-v2a.github.io.
Submitted 26 October, 2024; v1 submitted 1 June, 2024; originally announced June 2024.
Comments: Accepted by NeurIPS 2024.
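Sampling from a flow-matching model of this kind reduces to numerically integrating an ODE whose vector field the network predicts; with nearly straight (rectified) paths, very few Euler steps suffice. The sketch below shows that integration loop with a stub vector field; the field network, step counts, and conditioning are assumptions for illustration, not Frieren's implementation.

```python
# Minimal Euler ODE sampling for a (stub) rectified-flow vector field:
# integrate dx/dt = v(x, t, video) from noise at t=0 to data at t=1.
import numpy as np

def vector_field(x, t, video_feat):
    """Stub for a non-autoregressive estimator of the conditional transport field."""
    return video_feat - x   # placeholder field pointing from x toward the condition

def euler_sample(video_feat, steps=5, seed=0):
    rng = np.random.default_rng(seed)
    x = rng.standard_normal(video_feat.shape)   # x_0 ~ N(0, I)
    dt = 1.0 / steps
    for i in range(steps):
        t = i * dt
        x = x + vector_field(x, t, video_feat) * dt
    return x   # spectrogram latent; a vocoder would turn it into a waveform

video_feat = np.ones(4)
print(euler_sample(video_feat, steps=5))
print(euler_sample(video_feat, steps=1))   # one-step variant after reflow/distillation
```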
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">accepted by NeurIPS 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.09940">arXiv:2405.09940</a> <span> [<a href="https://arxiv.org/pdf/2405.09940">pdf</a>, <a href="https://arxiv.org/format/2405.09940">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Robust Singing Voice Transcription Serves Synthesis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Li%2C+R">Ruiqi Li</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+Y">Yu Zhang</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+Y">Yongqi Wang</a>, <a href="/search/eess?searchtype=author&query=Hong%2C+Z">Zhiqing Hong</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+R">Rongjie Huang</a>, <a href="/search/eess?searchtype=author&query=Zhao%2C+Z">Zhou Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.09940v2-abstract-short" style="display: inline;"> Note-level Automatic Singing Voice Transcription (AST) converts singing recordings into note sequences, facilitating the automatic annotation of singing datasets for Singing Voice Synthesis (SVS) applications. Current AST methods, however, struggle with accuracy and robustness when used for practical annotation. This paper presents ROSVOT, the first robust AST model that serves SVS, incorporating… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.09940v2-abstract-full').style.display = 'inline'; document.getElementById('2405.09940v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.09940v2-abstract-full" style="display: none;"> Note-level Automatic Singing Voice Transcription (AST) converts singing recordings into note sequences, facilitating the automatic annotation of singing datasets for Singing Voice Synthesis (SVS) applications. Current AST methods, however, struggle with accuracy and robustness when used for practical annotation. This paper presents ROSVOT, the first robust AST model that serves SVS, incorporating a multi-scale framework that effectively captures coarse-grained note information and ensures fine-grained frame-level segmentation, coupled with an attention-based pitch decoder for reliable pitch prediction. We also established a comprehensive annotation-and-training pipeline for SVS to test the model in real-world settings. Experimental findings reveal that ROSVOT achieves state-of-the-art transcription accuracy with either clean or noisy inputs. Moreover, when trained on enlarged, automatically annotated datasets, the SVS model outperforms its baseline, affirming the capability for practical application. Audio samples are available at https://rosvot.github.io. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.09940v2-abstract-full').style.display = 'none'; document.getElementById('2405.09940v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 16 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ACL 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.09787">arXiv:2405.09787</a> <span> [<a href="https://arxiv.org/pdf/2405.09787">pdf</a>, <a href="https://arxiv.org/format/2405.09787">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Analysis of the BraTS 2023 Intracranial Meningioma Segmentation Challenge </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=LaBella%2C+D">Dominic LaBella</a>, <a href="/search/eess?searchtype=author&query=Baid%2C+U">Ujjwal Baid</a>, <a href="/search/eess?searchtype=author&query=Khanna%2C+O">Omaditya Khanna</a>, <a href="/search/eess?searchtype=author&query=McBurney-Lin%2C+S">Shan McBurney-Lin</a>, <a href="/search/eess?searchtype=author&query=McLean%2C+R">Ryan McLean</a>, <a href="/search/eess?searchtype=author&query=Nedelec%2C+P">Pierre Nedelec</a>, <a href="/search/eess?searchtype=author&query=Rashid%2C+A">Arif Rashid</a>, <a href="/search/eess?searchtype=author&query=Tahon%2C+N+H">Nourel Hoda Tahon</a>, <a href="/search/eess?searchtype=author&query=Altes%2C+T">Talissa Altes</a>, <a href="/search/eess?searchtype=author&query=Bhalerao%2C+R">Radhika Bhalerao</a>, <a href="/search/eess?searchtype=author&query=Dhemesh%2C+Y">Yaseen Dhemesh</a>, <a href="/search/eess?searchtype=author&query=Godfrey%2C+D">Devon Godfrey</a>, <a href="/search/eess?searchtype=author&query=Hilal%2C+F">Fathi Hilal</a>, <a href="/search/eess?searchtype=author&query=Floyd%2C+S">Scott Floyd</a>, <a href="/search/eess?searchtype=author&query=Janas%2C+A">Anastasia Janas</a>, <a href="/search/eess?searchtype=author&query=Kazerooni%2C+A+F">Anahita Fathi Kazerooni</a>, <a href="/search/eess?searchtype=author&query=Kirkpatrick%2C+J">John Kirkpatrick</a>, <a href="/search/eess?searchtype=author&query=Kent%2C+C">Collin Kent</a>, <a href="/search/eess?searchtype=author&query=Kofler%2C+F">Florian Kofler</a>, <a href="/search/eess?searchtype=author&query=Leu%2C+K">Kevin Leu</a>, <a href="/search/eess?searchtype=author&query=Maleki%2C+N">Nazanin Maleki</a>, <a href="/search/eess?searchtype=author&query=Menze%2C+B">Bjoern Menze</a>, <a href="/search/eess?searchtype=author&query=Pajot%2C+M">Maxence Pajot</a>, <a href="/search/eess?searchtype=author&query=Reitman%2C+Z+J">Zachary J. Reitman</a>, <a href="/search/eess?searchtype=author&query=Rudie%2C+J+D">Jeffrey D. 
Authors: Dominic LaBella, Ujjwal Baid, Omaditya Khanna, Shan McBurney-Lin, Ryan McLean, Pierre Nedelec, Arif Rashid, Nourel Hoda Tahon, Talissa Altes, Radhika Bhalerao, Yaseen Dhemesh, Devon Godfrey, Fathi Hilal, Scott Floyd, Anastasia Janas, Anahita Fathi Kazerooni, John Kirkpatrick, Collin Kent, Florian Kofler, Kevin Leu, Nazanin Maleki, Bjoern Menze, Maxence Pajot, Zachary J. Reitman, Jeffrey D. Rudie, et al. (96 additional authors not shown)
Abstract: We describe the design and results from the BraTS 2023 Intracranial Meningioma Segmentation Challenge. The BraTS Meningioma Challenge differed from prior BraTS Glioma challenges in that it focused on meningiomas, which are typically benign extra-axial tumors with diverse radiologic and anatomical presentation and a propensity for multiplicity. Nine participating teams each developed deep-learning automated segmentation models using image data from the largest multi-institutional, systematically expert-annotated, multilabel, multi-sequence meningioma MRI dataset to date, which included 1000 training set cases, 141 validation set cases, and 283 hidden test set cases. Each case included T2, T2/FLAIR, T1, and T1Gd brain MRI sequences with associated tumor compartment labels delineating enhancing tumor, non-enhancing tumor, and surrounding non-enhancing T2/FLAIR hyperintensity. Participant automated segmentation models were evaluated and ranked based on a scoring system evaluating lesion-wise metrics, including the Dice similarity coefficient (DSC) and the 95% Hausdorff distance. The top-ranked team had a lesion-wise median DSC of 0.976, 0.976, and 0.964 for enhancing tumor, tumor core, and whole tumor, respectively, and a corresponding average DSC of 0.899, 0.904, and 0.871, respectively. These results serve as state-of-the-art benchmarks for future pre-operative meningioma automated segmentation algorithms. Additionally, we found that 1286 of 1424 cases (90.3%) had at least one compartment voxel abutting the edge of the skull-stripped image, which warrants further investigation into optimal pre-processing face anonymization steps.
Submitted 15 May, 2024; originally announced May 2024.
Comments: 16 pages, 11 tables, 10 figures, MICCAI.
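The headline metric above, the Dice similarity coefficient, is easy to compute; the challenge additionally scores it lesion-wise. The sketch below shows plain Dice plus a simplified lesion-wise variant that scores each ground-truth connected component separately; it is a stand-in for the challenge's actual lesion-matching protocol, and assumes numpy and scipy are available.

```python
# Dice similarity coefficient and a simplified lesion-wise variant.
import numpy as np
from scipy import ndimage

def dice(pred, gt, eps=1e-8):
    pred, gt = pred.astype(bool), gt.astype(bool)
    return (2.0 * np.logical_and(pred, gt).sum() + eps) / (pred.sum() + gt.sum() + eps)

def lesionwise_dice(pred, gt):
    labels, n = ndimage.label(gt)               # one component per ground-truth lesion
    scores = []
    for i in range(1, n + 1):
        lesion = labels == i
        scores.append(dice(np.logical_and(pred, lesion), lesion))
    return float(np.mean(scores)) if scores else 1.0

gt = np.zeros((32, 32), dtype=bool); gt[2:6, 2:6] = True; gt[20:25, 20:25] = True
pred = np.zeros_like(gt);            pred[2:6, 3:7] = True
print(dice(pred, gt), lesionwise_dice(pred, gt))   # missing the second lesion hurts the lesion-wise score
```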
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">16 pages, 11 tables, 10 figures, MICCAI</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.09313">arXiv:2404.09313</a> <span> [<a href="https://arxiv.org/pdf/2404.09313">pdf</a>, <a href="https://arxiv.org/format/2404.09313">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Text-to-Song: Towards Controllable Music Generation Incorporating Vocals and Accompaniment </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Hong%2C+Z">Zhiqing Hong</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+R">Rongjie Huang</a>, <a href="/search/eess?searchtype=author&query=Cheng%2C+X">Xize Cheng</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+Y">Yongqi Wang</a>, <a href="/search/eess?searchtype=author&query=Li%2C+R">Ruiqi Li</a>, <a href="/search/eess?searchtype=author&query=You%2C+F">Fuming You</a>, <a href="/search/eess?searchtype=author&query=Zhao%2C+Z">Zhou Zhao</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+Z">Zhimeng Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.09313v3-abstract-short" style="display: inline;"> A song is a combination of singing voice and accompaniment. However, existing works focus on singing voice synthesis and music generation independently. Little attention was paid to explore song synthesis. In this work, we propose a novel task called text-to-song synthesis which incorporating both vocals and accompaniments generation. We develop Melodist, a two-stage text-to-song method that consi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.09313v3-abstract-full').style.display = 'inline'; document.getElementById('2404.09313v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.09313v3-abstract-full" style="display: none;"> A song is a combination of singing voice and accompaniment. However, existing works focus on singing voice synthesis and music generation independently. Little attention was paid to explore song synthesis. In this work, we propose a novel task called text-to-song synthesis which incorporating both vocals and accompaniments generation. We develop Melodist, a two-stage text-to-song method that consists of singing voice synthesis (SVS) and vocal-to-accompaniment (V2A) synthesis. Melodist leverages tri-tower contrastive pretraining to learn more effective text representation for controllable V2A synthesis. A Chinese song dataset mined from a music website is built up to alleviate data scarcity for our research. The evaluation results on our dataset demonstrate that Melodist can synthesize songs with comparable quality and style consistency. Audio samples can be found in https://text2songMelodist.github.io/Sample/. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.09313v3-abstract-full').style.display = 'none'; document.getElementById('2404.09313v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 14 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ACL 2024 Main</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.19971">arXiv:2403.19971</a> <span> [<a href="https://arxiv.org/pdf/2403.19971">pdf</a>, <a href="https://arxiv.org/format/2403.19971">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> 3D-Speaker-Toolkit: An Open-Source Toolkit for Multimodal Speaker Verification and Diarization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Chen%2C+Y">Yafeng Chen</a>, <a href="/search/eess?searchtype=author&query=Zheng%2C+S">Siqi Zheng</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+H">Hui Wang</a>, <a href="/search/eess?searchtype=author&query=Cheng%2C+L">Luyao Cheng</a>, <a href="/search/eess?searchtype=author&query=Zhu%2C+T">Tinglong Zhu</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+R">Rongjie Huang</a>, <a href="/search/eess?searchtype=author&query=Deng%2C+C">Chong Deng</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+Q">Qian Chen</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+S">Shiliang Zhang</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+W">Wen Wang</a>, <a href="/search/eess?searchtype=author&query=Li%2C+X">Xihao Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.19971v2-abstract-short" style="display: inline;"> We introduce 3D-Speaker-Toolkit, an open-source toolkit for multimodal speaker verification and diarization, designed for meeting the needs of academic researchers and industrial practitioners. The 3D-Speaker-Toolkit adeptly leverages the combined strengths of acoustic, semantic, and visual data, seamlessly fusing these modalities to offer robust speaker recognition capabilities. The acoustic modu… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.19971v2-abstract-full').style.display = 'inline'; document.getElementById('2403.19971v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.19971v2-abstract-full" style="display: none;"> We introduce 3D-Speaker-Toolkit, an open-source toolkit for multimodal speaker verification and diarization, designed for meeting the needs of academic researchers and industrial practitioners. 
The 3D-Speaker-Toolkit adeptly leverages the combined strengths of acoustic, semantic, and visual data, seamlessly fusing these modalities to offer robust speaker recognition capabilities. The acoustic module extracts speaker embeddings from acoustic features, employing both fully-supervised and self-supervised learning approaches. The semantic module leverages advanced language models to comprehend the substance and context of spoken language, thereby augmenting the system's proficiency in distinguishing speakers through linguistic patterns. The visual module applies image processing technologies to scrutinize facial features, which bolsters the precision of speaker diarization in multi-speaker environments. Collectively, these modules empower the 3D-Speaker-Toolkit to achieve substantially improved accuracy and reliability in speaker-related tasks. With 3D-Speaker-Toolkit, we establish a new benchmark for multimodal speaker analysis. The toolkit also includes a handful of open-source state-of-the-art models and a large-scale dataset containing over 10,000 speakers. The toolkit is publicly available at https://github.com/modelscope/3D-Speaker.
Submitted 16 September, 2024; v1 submitted 29 March, 2024; originally announced March 2024.
arXiv:2403.11780 (https://arxiv.org/abs/2403.11780) [pdf, other]
Subjects: cs.SD (Sound); cs.AI (Artificial Intelligence); cs.LG (Machine Learning); eess.AS (Audio and Speech Processing)
Title: Prompt-Singer: Controllable Singing-Voice-Synthesis with Natural Language Prompt
Authors: Yongqi Wang, Ruofan Hu, Rongjie Huang, Zhiqing Hong, Ruiqi Li, Wenrui Liu, Fuming You, Tao Jin, Zhou Zhao
Abstract: Recent singing-voice-synthesis (SVS) methods have achieved remarkable audio quality and naturalness, yet they lack the capability to explicitly control the style attributes of the synthesized singing. We propose Prompt-Singer, the first SVS method that enables attribute control over singer gender, vocal range, and volume with natural language. We adopt a model architecture based on a decoder-only transformer with a multi-scale hierarchy, and design a range-melody decoupled pitch representation that enables text-conditioned vocal range control while keeping melodic accuracy. Furthermore, we explore various experiment settings, including different types of text representations, text encoder fine-tuning, and introducing speech data to alleviate data scarcity, aiming to facilitate further research. Experiments show that our model achieves favorable controllability and audio quality. Audio samples are available at http://prompt-singer.github.io.
Submitted 9 July, 2024; v1 submitted 18 March, 2024; originally announced March 2024.
Comments: Accepted by NAACL 2024 (main conference).
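The idea of a range-melody decoupled pitch representation can be illustrated with a tiny example: split each note into a coarse vocal-range token (here simply the average MIDI pitch) and a range-relative contour, so a prompt can move the range while the melody shape is preserved. The token definitions below are hypothetical, not Prompt-Singer's exact scheme.

```python
# Toy range-melody decoupling of a MIDI pitch sequence.
def decouple_pitch(midi_pitches):
    avg = round(sum(midi_pitches) / len(midi_pitches))   # vocal-range token
    melody = [p - avg for p in midi_pitches]             # range-invariant contour
    return avg, melody

def recouple_pitch(range_token, melody):
    return [range_token + m for m in melody]

pitches = [64, 66, 67, 69, 67, 64]            # original melody around E4
rng_tok, contour = decouple_pitch(pitches)
print(rng_tok, contour)
print(recouple_pitch(rng_tok - 12, contour))  # same melody, one octave lower
```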
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by NAACL 2024 (main conference)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.11081">arXiv:2403.11081</a> <span> [<a href="https://arxiv.org/pdf/2403.11081">pdf</a>, <a href="https://arxiv.org/format/2403.11081">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Enhanced Index Modulation Aided Non-Orthogonal Multiple Access via Constellation Rotation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Huang%2C+R">Ronglan Huang</a>, <a href="/search/eess?searchtype=author&query=ji%2C+F">Fei ji</a>, <a href="/search/eess?searchtype=author&query=Hu%2C+Z">Zeng Hu</a>, <a href="/search/eess?searchtype=author&query=Wan%2C+D">Dehuan Wan</a>, <a href="/search/eess?searchtype=author&query=Xu%2C+P">Pengcheng Xu</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+Y">Yun Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.11081v1-abstract-short" style="display: inline;"> Non-orthogonal multiple access (NOMA) has been widely nominated as an emerging spectral efficiency (SE) multiple access technique for the next generation of wireless communication network. To meet the growing demands in massive connectivity and huge data in transmission, a novel index modulation aided NOMA with the rotation of signal constellation of low power users (IM-NOMA-RC) is developed to th… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.11081v1-abstract-full').style.display = 'inline'; document.getElementById('2403.11081v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.11081v1-abstract-full" style="display: none;"> Non-orthogonal multiple access (NOMA) has been widely nominated as an emerging spectral efficiency (SE) multiple access technique for the next generation of wireless communication network. To meet the growing demands in massive connectivity and huge data in transmission, a novel index modulation aided NOMA with the rotation of signal constellation of low power users (IM-NOMA-RC) is developed to the downlink transmission. In the proposed IM-NOMA-RC system, the users are classified into far-user group and near-user group according to their channel conditions, where the rotation constellation based IM operation is performed only on the users who belong to the near-user group that are allocated lower power compared with the far ones to transmit extra information. In the proposed IM-NOMA-RC, all the subcarriers are activated to transmit information to multiple users to achieve higher SE. With the aid of the multiple dimension modulation in IM-NOMA-RC, more users can be supported over an orthogonal resource block. 
Then, both a maximum likelihood (ML) detector and a successive interference cancellation (SIC) detector are studied for all users. Numerical simulation results of the proposed IM-NOMA-RC scheme are investigated for the ML detector and the SIC detector for each user, showing that the proposed scheme can outperform conventional NOMA.
Submitted 17 March, 2024; originally announced March 2024.
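The downlink superposition described above is easy to sketch numerically: the far user receives a plain high-power symbol, the near user's low-power constellation is rotated, and which rotation angle is active could carry the extra index-modulation bits. The power split, angles, and the simple SIC step below are illustrative assumptions, not the paper's parameter choices.

```python
# Toy numpy sketch of power-domain NOMA superposition with a rotated near-user constellation.
import numpy as np

QPSK = np.array([1 + 1j, 1 - 1j, -1 + 1j, -1 - 1j]) / np.sqrt(2)

def superpose(far_sym, near_sym, rotation_index, p_far=0.8, p_near=0.2,
              angles=(0.0, np.pi / 8)):
    x_far = QPSK[far_sym]                                          # high-power far-user symbol
    x_near = QPSK[near_sym] * np.exp(1j * angles[rotation_index])  # rotated near-user symbol
    return np.sqrt(p_far) * x_far + np.sqrt(p_near) * x_near

def sic_near_user(y, p_far=0.8):
    """SIC at the near user: detect and subtract the far-user symbol first."""
    far_hat = QPSK[np.argmin(np.abs(y - np.sqrt(p_far) * QPSK))]
    return y - np.sqrt(p_far) * far_hat   # residual carries the near-user (rotated) signal

y = superpose(far_sym=2, near_sym=1, rotation_index=1)
print(y, sic_near_user(y))
```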
arXiv:2402.12820 (https://arxiv.org/abs/2402.12820) [pdf, other]
Subjects: eess.SY (Systems and Control)
Title: ASCEND: Accurate yet Efficient End-to-End Stochastic Computing Acceleration of Vision Transformer
Authors: Tong Xie, Yixuan Hu, Renjie Wei, Meng Li, Yuan Wang, Runsheng Wang, Ru Huang
Abstract: Stochastic computing (SC) has emerged as a promising computing paradigm for neural acceleration. However, how to accelerate the state-of-the-art Vision Transformer (ViT) with SC remains unclear. Unlike convolutional neural networks, ViTs introduce notable compatibility and efficiency challenges because of their nonlinear functions, e.g., softmax and Gaussian Error Linear Units (GELU). In this paper, for the first time, a ViT accelerator based on end-to-end SC, dubbed ASCEND, is proposed. ASCEND co-designs the SC circuits and ViT networks to enable accurate yet efficient acceleration. To overcome the compatibility challenges, ASCEND proposes a novel deterministic SC block for GELU and leverages an SC-friendly iterative approximate algorithm to design an accurate and efficient softmax circuit. To improve inference efficiency, ASCEND develops a two-stage training pipeline to produce accurate low-precision ViTs. With extensive experiments, we show that the proposed GELU and softmax blocks achieve 56.3% and 22.6% error reduction compared to existing SC designs, respectively, and reduce the area-delay product (ADP) by 5.29x and 12.6x, respectively. Moreover, compared to the baseline low-precision ViTs, ASCEND also achieves significant accuracy improvements on CIFAR10 and CIFAR100.
Submitted 20 February, 2024; originally announced February 2024.
Comments: Accepted in DATE 2024.
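As background for the paradigm ASCEND builds on, unipolar stochastic computing represents a value in [0, 1] as a random bitstream whose mean equals that value, so multiplication reduces to a bitwise AND of two streams. The sketch below demonstrates generic SC multiplication only; it is not ASCEND's deterministic GELU or softmax circuit.

```python
# Generic unipolar stochastic-computing multiplication: AND of two random bitstreams.
import numpy as np

def to_bitstream(value, length, rng):
    """Encode a probability in [0, 1] as a random bitstream with that mean."""
    return (rng.random(length) < value).astype(np.uint8)

def sc_multiply(a, b, length=4096, seed=1):
    rng = np.random.default_rng(seed)
    sa = to_bitstream(a, length, rng)
    sb = to_bitstream(b, length, rng)      # independent stream from the same generator
    return float(np.mean(sa & sb))         # AND gate approximates a * b in expectation

print(sc_multiply(0.6, 0.5), 0.6 * 0.5)    # stochastic estimate vs. exact product
```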
arXiv:2402.12208 (https://arxiv.org/abs/2402.12208) [pdf, other]
Subjects: eess.AS (Audio and Speech Processing); cs.SD (Sound)
Title: Language-Codec: Reducing the Gaps Between Discrete Codec Representation and Speech Language Models
Authors: Shengpeng Ji, Minghui Fang, Ziyue Jiang, Siqi Zheng, Qian Chen, Rongjie Huang, Jialung Zuo, Shulei Wang, Zhou Zhao
Abstract: In recent years, large language models have achieved significant success in generative tasks (e.g., speech cloning and audio generation) related to speech, audio, music, and other signal domains. A crucial element of these models is the discrete acoustic codec, which serves as an intermediate representation replacing the mel-spectrogram. However, there exist several gaps between discrete codecs and downstream speech language models. Specifically, 1) most codec models are trained on only 1,000 hours of data, whereas most speech language models are trained on 60,000 hours; 2) achieving good reconstruction performance requires numerous codebooks, which increases the burden on downstream speech language models; 3) the initial channel of the codebooks contains excessive information, making it challenging to directly generate acoustic tokens from weakly supervised signals such as text in downstream tasks. Consequently, leveraging the characteristics of speech language models, we propose Language-Codec. In Language-Codec, we introduce a Mask Channel Residual Vector Quantization (MCRVQ) mechanism along with improved Fourier transform structures and larger training datasets to address the aforementioned gaps. We compare our method with competing audio compression algorithms and observe that it significantly outperforms them across extensive evaluations. Furthermore, we also validate the efficiency of Language-Codec on downstream speech language models. The source code and pre-trained models can be accessed at https://github.com/jishengpeng/languagecodec.
Submitted 27 April, 2024; v1 submitted 19 February, 2024; originally announced February 2024.
Comments: We release a more powerful checkpoint in Language-Codec v3.
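The MCRVQ mechanism builds on plain residual vector quantization, where each codebook quantizes the residual left by the previous one, so the first codebooks carry the coarsest information. The sketch below shows that base mechanism only; the masking and channel details of MCRVQ are not reproduced, and the random codebooks are purely illustrative.

```python
# Plain residual vector quantization (RVQ): each stage quantizes the previous residual.
import numpy as np

def rvq_encode(x, codebooks):
    residual, codes = x.copy(), []
    for cb in codebooks:                                    # cb: (K, D) codebook
        idx = np.argmin(np.linalg.norm(cb - residual, axis=1))
        codes.append(int(idx))
        residual = residual - cb[idx]                       # pass what is left to the next stage
    return codes, x - residual                              # token ids and the reconstruction

rng = np.random.default_rng(0)
codebooks = [rng.standard_normal((16, 8)) for _ in range(4)]  # 4 quantizer stages
x = rng.standard_normal(8)
codes, x_hat = rvq_encode(x, codebooks)
print(codes, float(np.linalg.norm(x - x_hat)))                # residual error shrinks with more stages
```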
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">We release a more powerful checkpoint in Language-Codec v3</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2402.05554">arXiv:2402.05554</a> <span> [<a href="https://arxiv.org/pdf/2402.05554">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1016/j.ultrasmedbio.2023.10.009">10.1016/j.ultrasmedbio.2023.10.009 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> One-Stop Automated Diagnostic System for Carpal Tunnel Syndrome in Ultrasound Images Using Deep Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Peng%2C+J">Jiayu Peng</a>, <a href="/search/eess?searchtype=author&query=Zeng%2C+J">Jiajun Zeng</a>, <a href="/search/eess?searchtype=author&query=Lai%2C+M">Manlin Lai</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+R">Ruobing Huang</a>, <a href="/search/eess?searchtype=author&query=Ni%2C+D">Dong Ni</a>, <a href="/search/eess?searchtype=author&query=Li%2C+Z">Zhenzhou Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2402.05554v1-abstract-short" style="display: inline;"> Objective: Ultrasound (US) examination has unique advantages in diagnosing carpal tunnel syndrome (CTS) while identifying the median nerve (MN) and diagnosing CTS depends heavily on the expertise of examiners. To alleviate this problem, we aimed to develop a one-stop automated CTS diagnosis system (OSA-CTSD) and evaluate its effectiveness as a computer-aided diagnostic tool. Methods: We combined r… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.05554v1-abstract-full').style.display = 'inline'; document.getElementById('2402.05554v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2402.05554v1-abstract-full" style="display: none;"> Objective: Ultrasound (US) examination has unique advantages in diagnosing carpal tunnel syndrome (CTS) while identifying the median nerve (MN) and diagnosing CTS depends heavily on the expertise of examiners. To alleviate this problem, we aimed to develop a one-stop automated CTS diagnosis system (OSA-CTSD) and evaluate its effectiveness as a computer-aided diagnostic tool. Methods: We combined real-time MN delineation, accurate biometric measurements, and explainable CTS diagnosis into a unified framework, called OSA-CTSD. We collected a total of 32,301 static images from US videos of 90 normal wrists and 40 CTS wrists for evaluation using a simplified scanning protocol. 
Results: The proposed model showed better segmentation and measurement performance than competing methods, reporting an HD95 score of 7.21px, an ASSD score of 2.64px, a Dice score of 85.78%, and an IoU score of 76.00%. In the reader study, it demonstrated performance comparable to the average of experienced radiologists in classifying CTS, while outperforming inexperienced radiologists on classification metrics (e.g., an accuracy score 3.59% higher and an F1 score 5.85% higher). Conclusion: The OSA-CTSD demonstrated promising diagnostic performance with the advantages of real-time operation, automation, and clinical interpretability. The application of such a tool can not only reduce reliance on the expertise of examiners, but can also help promote the future standardization of the CTS diagnosis process, benefiting both patients and radiologists. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.05554v1-abstract-full').style.display = 'none'; document.getElementById('2402.05554v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by Ultrasound in Medicine & Biology</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Ultrasound in Medicine & Biology, Volume 50, Issue 2, February 2024, Pages 304-314 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2402.04921">arXiv:2402.04921</a> <span> [<a href="https://arxiv.org/pdf/2402.04921">pdf</a>, <a href="https://arxiv.org/format/2402.04921">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Is Two-shot All You Need? A Label-efficient Approach for Video Segmentation in Breast Ultrasound </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zeng%2C+J">Jiajun Zeng</a>, <a href="/search/eess?searchtype=author&query=Ni%2C+D">Dong Ni</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+R">Ruobing Huang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2402.04921v2-abstract-short" style="display: inline;"> Breast lesion segmentation from breast ultrasound (BUS) videos could assist in early diagnosis and treatment. Existing video object segmentation (VOS) methods usually require dense annotation, which is often inaccessible for medical datasets. Furthermore, they suffer from accumulative errors and a lack of explicit space-time awareness.
In this work, we propose a novel two-shot training paradigm fo… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.04921v2-abstract-full').style.display = 'inline'; document.getElementById('2402.04921v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2402.04921v2-abstract-full" style="display: none;"> Breast lesion segmentation from breast ultrasound (BUS) videos could assist in early diagnosis and treatment. Existing video object segmentation (VOS) methods usually require dense annotation, which is often inaccessible for medical datasets. Furthermore, they suffer from accumulative errors and a lack of explicit space-time awareness. In this work, we propose a novel two-shot training paradigm for BUS video segmentation. It not only is able to capture free-range space-time consistency but also utilizes a source-dependent augmentation scheme. This label-efficient learning framework is validated on a challenging in-house BUS video dataset. Results showed that it gained comparable performance to the fully annotated ones given only 1.9% training labels. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.04921v2-abstract-full').style.display = 'none'; document.getElementById('2402.04921v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 7 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages, 4 figure, 2 tables, accepted by ISBI 2024</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.4.6 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2401.12789">arXiv:2401.12789</a> <span> [<a href="https://arxiv.org/pdf/2401.12789">pdf</a>, <a href="https://arxiv.org/format/2401.12789">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Multilingual and Fully Non-Autoregressive ASR with Large Language Model Fusion: A Comprehensive Study </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Huang%2C+W+R">W. 
Ronny Huang</a>, <a href="/search/eess?searchtype=author&query=Allauzen%2C+C">Cyril Allauzen</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+T">Tongzhou Chen</a>, <a href="/search/eess?searchtype=author&query=Gupta%2C+K">Kilol Gupta</a>, <a href="/search/eess?searchtype=author&query=Hu%2C+K">Ke Hu</a>, <a href="/search/eess?searchtype=author&query=Qin%2C+J">James Qin</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+Y">Yu Zhang</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+Y">Yongqiang Wang</a>, <a href="/search/eess?searchtype=author&query=Chang%2C+S">Shuo-Yiin Chang</a>, <a href="/search/eess?searchtype=author&query=Sainath%2C+T+N">Tara N. Sainath</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2401.12789v1-abstract-short" style="display: inline;"> In the era of large models, the autoregressive nature of decoding often results in latency serving as a significant bottleneck. We propose a non-autoregressive LM-fused ASR system that effectively leverages the parallelization capabilities of accelerator hardware. Our approach combines the Universal Speech Model (USM) and the PaLM 2 language model in per-segment scoring mode, achieving an average… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.12789v1-abstract-full').style.display = 'inline'; document.getElementById('2401.12789v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2401.12789v1-abstract-full" style="display: none;"> In the era of large models, the autoregressive nature of decoding often results in latency serving as a significant bottleneck. We propose a non-autoregressive LM-fused ASR system that effectively leverages the parallelization capabilities of accelerator hardware. Our approach combines the Universal Speech Model (USM) and the PaLM 2 language model in per-segment scoring mode, achieving an average relative WER improvement across all languages of 10.8% on FLEURS and 3.6% on YouTube captioning. Furthermore, our comprehensive ablation study analyzes key parameters such as LLM size, context length, vocabulary size, fusion methodology. For instance, we explore the impact of LLM size ranging from 128M to 340B parameters on ASR performance. This study provides valuable insights into the factors influencing the effectiveness of practical large-scale LM-fused speech recognition systems. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.12789v1-abstract-full').style.display = 'none'; document.getElementById('2401.12789v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 January, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2024. 
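<p class="is-size-7">As a rough illustration of the per-segment scoring idea mentioned above, the sketch below reranks ASR hypotheses by adding a weighted language-model log-probability to the first-pass score; the scores, fusion weight, and function name are illustrative assumptions rather than the paper's actual setup.</p>
<pre><code class="language-python">
# Toy per-segment scoring: rerank ASR hypotheses by adding a weighted
# language-model log-probability to the first-pass ASR score.
# Scores and the fusion weight below are made up for illustration.
def fuse(hypotheses, lm_weight=0.3):
    """hypotheses: list of (text, asr_logprob, lm_logprob) for one segment."""
    return max(hypotheses, key=lambda h: h[1] + lm_weight * h[2])

segment_nbest = [
    ("the cat sat on the mat", -12.4, -18.0),
    ("the cat sad on the mat", -12.1, -25.5),
]
print(fuse(segment_nbest)[0])   # the LM evidence flips the first-pass decision
</code></pre>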
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICASSP 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2312.15197">arXiv:2312.15197</a> <span> [<a href="https://arxiv.org/pdf/2312.15197">pdf</a>, <a href="https://arxiv.org/format/2312.15197">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> TransFace: Unit-Based Audio-Visual Speech Synthesizer for Talking Head Translation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Cheng%2C+X">Xize Cheng</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+R">Rongjie Huang</a>, <a href="/search/eess?searchtype=author&query=Li%2C+L">Linjun Li</a>, <a href="/search/eess?searchtype=author&query=Jin%2C+T">Tao Jin</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+Z">Zehan Wang</a>, <a href="/search/eess?searchtype=author&query=Yin%2C+A">Aoxiong Yin</a>, <a href="/search/eess?searchtype=author&query=Li%2C+M">Minglei Li</a>, <a href="/search/eess?searchtype=author&query=Duan%2C+X">Xinyu Duan</a>, <a href="/search/eess?searchtype=author&query=yang%2C+c">changpeng yang</a>, <a href="/search/eess?searchtype=author&query=Zhao%2C+Z">Zhou Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2312.15197v1-abstract-short" style="display: inline;"> Direct speech-to-speech translation achieves high-quality results through the introduction of discrete units obtained from self-supervised learning. This approach circumvents delays and cascading errors associated with model cascading. However, talking head translation, converting audio-visual speech (i.e., talking head video) from one language into another, still confronts several challenges comp… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.15197v1-abstract-full').style.display = 'inline'; document.getElementById('2312.15197v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2312.15197v1-abstract-full" style="display: none;"> Direct speech-to-speech translation achieves high-quality results through the introduction of discrete units obtained from self-supervised learning. This approach circumvents delays and cascading errors associated with model cascading. However, talking head translation, converting audio-visual speech (i.e., talking head video) from one language into another, still confronts several challenges compared to audio speech: (1) Existing methods invariably rely on cascading, synthesizing via both audio and text, resulting in delays and cascading errors. (2) Talking head translation has a limited set of reference frames. 
If the generated translation exceeds the length of the original speech, the video sequence needs to be supplemented by repeating frames, leading to jarring video transitions. In this work, we propose a model for talking head translation, \textbf{TransFace}, which can directly translate audio-visual speech into audio-visual speech in other languages. It consists of a speech-to-unit translation model to convert audio speech into discrete units and a unit-based audio-visual speech synthesizer, Unit2Lip, to re-synthesize synchronized audio-visual speech from discrete units in parallel. Furthermore, we introduce a Bounded Duration Predictor, ensuring isometric talking head translation and preventing duplicate reference frames. Experiments demonstrate that our proposed Unit2Lip model significantly improves synchronization (1.601 and 0.982 on LSE-C for the original and generated audio speech, respectively) and boosts inference speed by a factor of 4.35 on LRS2. Additionally, TransFace achieves impressive BLEU scores of 61.93 and 47.55 for Es-En and Fr-En on LRS3-T and 100% isochronous translations. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.15197v1-abstract-full').style.display = 'none'; document.getElementById('2312.15197v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2312.10741">arXiv:2312.10741</a> <span> [<a href="https://arxiv.org/pdf/2312.10741">pdf</a>, <a href="https://arxiv.org/format/2312.10741">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1609/aaai.v38i17.29932">10.1609/aaai.v38i17.29932 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> StyleSinger: Style Transfer for Out-of-Domain Singing Voice Synthesis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhang%2C+Y">Yu Zhang</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+R">Rongjie Huang</a>, <a href="/search/eess?searchtype=author&query=Li%2C+R">Ruiqi Li</a>, <a href="/search/eess?searchtype=author&query=He%2C+J">JinZheng He</a>, <a href="/search/eess?searchtype=author&query=Xia%2C+Y">Yan Xia</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+F">Feiyang Chen</a>, <a href="/search/eess?searchtype=author&query=Duan%2C+X">Xinyu Duan</a>, <a href="/search/eess?searchtype=author&query=Huai%2C+B">Baoxing Huai</a>, <a href="/search/eess?searchtype=author&query=Zhao%2C+Z">Zhou Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark 
mathjax" id="2312.10741v3-abstract-short" style="display: inline;"> Style transfer for out-of-domain (OOD) singing voice synthesis (SVS) focuses on generating high-quality singing voices with unseen styles (such as timbre, emotion, pronunciation, and articulation skills) derived from reference singing voice samples. However, the endeavor to model the intricate nuances of singing voice styles is an arduous task, as singing voices possess a remarkable degree of expr… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.10741v3-abstract-full').style.display = 'inline'; document.getElementById('2312.10741v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2312.10741v3-abstract-full" style="display: none;"> Style transfer for out-of-domain (OOD) singing voice synthesis (SVS) focuses on generating high-quality singing voices with unseen styles (such as timbre, emotion, pronunciation, and articulation skills) derived from reference singing voice samples. However, the endeavor to model the intricate nuances of singing voice styles is an arduous task, as singing voices possess a remarkable degree of expressiveness. Moreover, existing SVS methods encounter a decline in the quality of synthesized singing voices in OOD scenarios, as they rest upon the assumption that the target vocal attributes are discernible during the training phase. To overcome these challenges, we propose StyleSinger, the first singing voice synthesis model for zero-shot style transfer of out-of-domain reference singing voice samples. StyleSinger incorporates two critical approaches for enhanced effectiveness: 1) the Residual Style Adaptor (RSA) which employs a residual quantization module to capture diverse style characteristics in singing voices, and 2) the Uncertainty Modeling Layer Normalization (UMLN) to perturb the style attributes within the content representation during the training phase and thus improve the model generalization. Our extensive evaluations in zero-shot style transfer undeniably establish that StyleSinger outperforms baseline models in both audio quality and similarity to the reference singing voice samples. Access to singing voice samples can be found at https://stylesinger.github.io/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.10741v3-abstract-full').style.display = 'none'; document.getElementById('2312.10741v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 17 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by AAAI 2024</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Proceedings of the AAAI Conference on Artificial Intelligence, 38(17), 19597-19605. 
(2024) </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2311.18216">arXiv:2311.18216</a> <span> [<a href="https://arxiv.org/pdf/2311.18216">pdf</a>, <a href="https://arxiv.org/format/2311.18216">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> FS-BAND: A Frequency-Sensitive Banding Detector </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Chen%2C+Z">Zijian Chen</a>, <a href="/search/eess?searchtype=author&query=Sun%2C+W">Wei Sun</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+Z">Zicheng Zhang</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+R">Ru Huang</a>, <a href="/search/eess?searchtype=author&query=Lu%2C+F">Fangfang Lu</a>, <a href="/search/eess?searchtype=author&query=Min%2C+X">Xiongkuo Min</a>, <a href="/search/eess?searchtype=author&query=Zhai%2C+G">Guangtao Zhai</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+W">Wenjun Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2311.18216v1-abstract-short" style="display: inline;"> Banding artifact, also known as staircase-like contour, is a common quality annoyance that arises in scenarios such as compression and transmission, and it largely affects the user's quality of experience (QoE). The banding distortion typically appears as relatively small pixel-wise variations in smooth backgrounds, which is difficult to analyze in the spatial domain but easily reflected in the frequency… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.18216v1-abstract-full').style.display = 'inline'; document.getElementById('2311.18216v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2311.18216v1-abstract-full" style="display: none;"> Banding artifact, also known as staircase-like contour, is a common quality annoyance that arises in scenarios such as compression and transmission, and it largely affects the user's quality of experience (QoE). The banding distortion typically appears as relatively small pixel-wise variations in smooth backgrounds, which is difficult to analyze in the spatial domain but easily reflected in the frequency domain. In this paper, we therefore study the banding artifact from the frequency aspect and propose a no-reference banding detection model to capture and evaluate banding artifacts, called the Frequency-Sensitive BANding Detector (FS-BAND). The proposed detector is able to generate a pixel-wise banding map with a perception-correlated quality score. Experimental results show that the proposed FS-BAND method outperforms state-of-the-art image quality assessment (IQA) approaches with higher accuracy in the banding classification task.
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.18216v1-abstract-full').style.display = 'none'; document.getElementById('2311.18216v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">arXiv admin note: substantial text overlap with arXiv:2311.17752</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2311.07033">arXiv:2311.07033</a> <span> [<a href="https://arxiv.org/pdf/2311.07033">pdf</a>, <a href="https://arxiv.org/format/2311.07033">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> TTMFN: Two-stream Transformer-based Multimodal Fusion Network for Survival Prediction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Ge%2C+R">Ruiquan Ge</a>, <a href="/search/eess?searchtype=author&query=Hu%2C+X">Xiangyang Hu</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+R">Rungen Huang</a>, <a href="/search/eess?searchtype=author&query=Jia%2C+G">Gangyong Jia</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+Y">Yaqi Wang</a>, <a href="/search/eess?searchtype=author&query=Gu%2C+R">Renshu Gu</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+C">Changmiao Wang</a>, <a href="/search/eess?searchtype=author&query=Ahmed%2C+E">Elazab Ahmed</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+L">Linyan Wang</a>, <a href="/search/eess?searchtype=author&query=Ye%2C+J">Juan Ye</a>, <a href="/search/eess?searchtype=author&query=Li%2C+Y">Ye Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2311.07033v1-abstract-short" style="display: inline;"> Survival prediction plays a crucial role in assisting clinicians with the development of cancer treatment protocols. Recent evidence shows that multimodal data can help in the diagnosis of cancer disease and improve survival prediction. Currently, deep learning-based approaches have experienced increasing success in survival prediction by integrating pathological images and gene expression data. H… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.07033v1-abstract-full').style.display = 'inline'; document.getElementById('2311.07033v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2311.07033v1-abstract-full" style="display: none;"> Survival prediction plays a crucial role in assisting clinicians with the development of cancer treatment protocols. Recent evidence shows that multimodal data can help in the diagnosis of cancer disease and improve survival prediction. 
Currently, deep learning-based approaches have experienced increasing success in survival prediction by integrating pathological images and gene expression data. However, most existing approaches overlook the intra-modality latent information and the complex inter-modality correlations. Furthermore, existing approaches do not fully exploit the immense representational capabilities of neural networks for feature aggregation and disregard the importance of relationships between features. Addressing these issues is therefore key to improving prediction performance. We propose a novel framework named Two-stream Transformer-based Multimodal Fusion Network for survival prediction (TTMFN), which integrates pathological images and gene expression data. In TTMFN, we present a two-stream multimodal co-attention transformer module to take full advantage of the complex relationships between different modalities and the potential connections within the modalities. Additionally, we develop a multi-head attention pooling approach to effectively aggregate the feature representations of the two modalities. The experimental results on four datasets from The Cancer Genome Atlas demonstrate that TTMFN can achieve the best performance or competitive results compared to the state-of-the-art methods in predicting the overall survival of patients. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.07033v1-abstract-full').style.display = 'none'; document.getElementById('2311.07033v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2023.
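<p class="is-size-7">A minimal PyTorch sketch of multi-head attention pooling over a set of feature tokens, the kind of aggregation step named in the TTMFN abstract above; the learnable query, dimensions, and class name are illustrative assumptions, not the paper's implementation.</p>
<pre><code class="language-python">
# Toy multi-head attention pooling: a learnable query attends over a set of
# feature tokens and returns a single pooled vector per sample.
import torch
import torch.nn as nn

class AttentionPool(nn.Module):
    def __init__(self, dim=64, heads=4):
        super().__init__()
        self.query = nn.Parameter(torch.randn(1, 1, dim))
        self.attn = nn.MultiheadAttention(dim, heads, batch_first=True)

    def forward(self, tokens):                  # tokens: (batch, n_tokens, dim)
        q = self.query.expand(tokens.size(0), -1, -1)
        pooled, _ = self.attn(q, tokens, tokens)
        return pooled.squeeze(1)                # (batch, dim)

feats = torch.randn(2, 10, 64)                  # e.g. per-patch or per-gene features
print(AttentionPool()(feats).shape)             # torch.Size([2, 64])
</code></pre>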
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2310.17864">arXiv:2310.17864</a> <span> [<a href="https://arxiv.org/pdf/2310.17864">pdf</a>, <a href="https://arxiv.org/format/2310.17864">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> TorchAudio 2.1: Advancing speech recognition, self-supervised learning, and audio processing components for PyTorch </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Hwang%2C+J">Jeff Hwang</a>, <a href="/search/eess?searchtype=author&query=Hira%2C+M">Moto Hira</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+C">Caroline Chen</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+X">Xiaohui Zhang</a>, <a href="/search/eess?searchtype=author&query=Ni%2C+Z">Zhaoheng Ni</a>, <a href="/search/eess?searchtype=author&query=Sun%2C+G">Guangzhi Sun</a>, <a href="/search/eess?searchtype=author&query=Ma%2C+P">Pingchuan Ma</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+R">Ruizhe Huang</a>, <a href="/search/eess?searchtype=author&query=Pratap%2C+V">Vineel Pratap</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+Y">Yuekai Zhang</a>, <a href="/search/eess?searchtype=author&query=Kumar%2C+A">Anurag Kumar</a>, <a href="/search/eess?searchtype=author&query=Yu%2C+C">Chin-Yun Yu</a>, <a href="/search/eess?searchtype=author&query=Zhu%2C+C">Chuang Zhu</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+C">Chunxi Liu</a>, <a href="/search/eess?searchtype=author&query=Kahn%2C+J">Jacob Kahn</a>, <a href="/search/eess?searchtype=author&query=Ravanelli%2C+M">Mirco Ravanelli</a>, <a href="/search/eess?searchtype=author&query=Sun%2C+P">Peng Sun</a>, <a href="/search/eess?searchtype=author&query=Watanabe%2C+S">Shinji Watanabe</a>, <a href="/search/eess?searchtype=author&query=Shi%2C+Y">Yangyang Shi</a>, <a href="/search/eess?searchtype=author&query=Tao%2C+Y">Yumeng Tao</a>, <a href="/search/eess?searchtype=author&query=Scheibler%2C+R">Robin Scheibler</a>, <a href="/search/eess?searchtype=author&query=Cornell%2C+S">Samuele Cornell</a>, <a href="/search/eess?searchtype=author&query=Kim%2C+S">Sean Kim</a>, <a href="/search/eess?searchtype=author&query=Petridis%2C+S">Stavros Petridis</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2310.17864v1-abstract-short" style="display: inline;"> TorchAudio is an open-source audio and speech processing library built for PyTorch. It aims to accelerate the research and development of audio and speech technologies by providing well-designed, easy-to-use, and performant PyTorch components. Its contributors routinely engage with users to understand their needs and fulfill them by developing impactful features. 
Here, we survey TorchAudio's devel… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.17864v1-abstract-full').style.display = 'inline'; document.getElementById('2310.17864v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2310.17864v1-abstract-full" style="display: none;"> TorchAudio is an open-source audio and speech processing library built for PyTorch. It aims to accelerate the research and development of audio and speech technologies by providing well-designed, easy-to-use, and performant PyTorch components. Its contributors routinely engage with users to understand their needs and fulfill them by developing impactful features. Here, we survey TorchAudio's development principles and contents and highlight key features we include in its latest version (2.1): self-supervised learning pre-trained pipelines and training recipes, high-performance CTC decoders, speech recognition models and training recipes, advanced media I/O capabilities, and tools for performing forced alignment, multi-channel speech enhancement, and reference-less speech assessment. For a selection of these features, through empirical studies, we demonstrate their efficacy and show that they achieve competitive or state-of-the-art performance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.17864v1-abstract-full').style.display = 'none'; document.getElementById('2310.17864v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2310.13015">arXiv:2310.13015</a> <span> [<a href="https://arxiv.org/pdf/2310.13015">pdf</a>, <a href="https://arxiv.org/format/2310.13015">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Audio-AdapterFusion: A Task-ID-free Approach for Efficient and Non-Destructive Multi-task Speech Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Ngai%2C+H">Hillary Ngai</a>, <a href="/search/eess?searchtype=author&query=Agrawal%2C+R">Rohan Agrawal</a>, <a href="/search/eess?searchtype=author&query=Gaur%2C+N">Neeraj Gaur</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+R">Ronny Huang</a>, <a href="/search/eess?searchtype=author&query=Haghani%2C+P">Parisa Haghani</a>, <a href="/search/eess?searchtype=author&query=Mengibar%2C+P+M">Pedro Moreno Mengibar</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2310.13015v1-abstract-short" style="display: inline;"> Adapters are an efficient, composable alternative to full fine-tuning of pre-trained models and help scale the deployment of large ASR models to many tasks. 
In practice, a task ID is commonly prepended to the input during inference to route to single-task adapters for the specified task. However, one major limitation of this approach is that the task ID may not be known during inference, rendering… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.13015v1-abstract-full').style.display = 'inline'; document.getElementById('2310.13015v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2310.13015v1-abstract-full" style="display: none;"> Adapters are an efficient, composable alternative to full fine-tuning of pre-trained models and help scale the deployment of large ASR models to many tasks. In practice, a task ID is commonly prepended to the input during inference to route to single-task adapters for the specified task. However, one major limitation of this approach is that the task ID may not be known during inference, rendering it unsuitable for most multi-task settings. To address this, we propose three novel task-ID-free methods to combine single-task adapters in multi-task ASR and investigate two learning algorithms for training. We evaluate our methods on 10 test sets from 4 diverse ASR tasks and show that our methods are non-destructive and parameter-efficient. While only updating 17% of the model parameters, our methods can achieve an 8% mean WER improvement relative to full fine-tuning and are on-par with task-ID adapter routing. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.13015v1-abstract-full').style.display = 'none'; document.getElementById('2310.13015v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2023. 
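<p class="is-size-7">One simple way to combine single-task adapters without a task ID, in the spirit of the Audio-AdapterFusion abstract above, is a learned weighted sum of the adapter outputs; the residual form, weighting scheme, and names in this sketch are illustrative assumptions rather than the paper's exact methods.</p>
<pre><code class="language-python">
# Toy task-ID-free adapter fusion: outputs of several single-task adapters are
# combined with learned softmax weights instead of routing by task ID.
import torch
import torch.nn as nn

class AdapterFusion(nn.Module):
    def __init__(self, adapters):
        super().__init__()
        self.adapters = nn.ModuleList(adapters)     # pre-trained single-task adapters
        self.logits = nn.Parameter(torch.zeros(len(adapters)))

    def forward(self, hidden):                      # hidden: (batch, dim)
        outs = torch.stack([a(hidden) for a in self.adapters], dim=0)
        w = torch.softmax(self.logits, dim=0).view(-1, 1, 1)
        return hidden + (w * outs).sum(dim=0)       # residual combination

dim = 32
adapters = [nn.Sequential(nn.Linear(dim, 8), nn.ReLU(), nn.Linear(8, dim))
            for _ in range(4)]                      # one bottleneck adapter per task
print(AdapterFusion(adapters)(torch.randn(2, dim)).shape)
</code></pre>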
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">2023 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU) Proceedings</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2310.05021">arXiv:2310.05021</a> <span> [<a href="https://arxiv.org/pdf/2310.05021">pdf</a>, <a href="https://arxiv.org/format/2310.05021">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Toward Intelligent Emergency Control for Large-scale Power Systems: Convergence of Learning, Physics, Computing and Control </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Huang%2C+Q">Qiuhua Huang</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+R">Renke Huang</a>, <a href="/search/eess?searchtype=author&query=Yin%2C+T">Tianzhixi Yin</a>, <a href="/search/eess?searchtype=author&query=Datta%2C+S">Sohom Datta</a>, <a href="/search/eess?searchtype=author&query=Sun%2C+X">Xueqing Sun</a>, <a href="/search/eess?searchtype=author&query=Hou%2C+J">Jason Hou</a>, <a href="/search/eess?searchtype=author&query=Tan%2C+J">Jie Tan</a>, <a href="/search/eess?searchtype=author&query=Yu%2C+W">Wenhao Yu</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+Y">Yuan Liu</a>, <a href="/search/eess?searchtype=author&query=Li%2C+X">Xinya Li</a>, <a href="/search/eess?searchtype=author&query=Palmer%2C+B">Bruce Palmer</a>, <a href="/search/eess?searchtype=author&query=Li%2C+A">Ang Li</a>, <a href="/search/eess?searchtype=author&query=Ke%2C+X">Xinda Ke</a>, <a href="/search/eess?searchtype=author&query=Vaiman%2C+M">Marianna Vaiman</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+S">Song Wang</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+Y">Yousu Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2310.05021v1-abstract-short" style="display: inline;"> This paper has delved into the pressing need for intelligent emergency control in large-scale power systems, which are experiencing significant transformations and are operating closer to their limits with more uncertainties. Learning-based control methods are promising and have shown effectiveness for intelligent power system control. However, when they are applied to large-scale power systems, t… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.05021v1-abstract-full').style.display = 'inline'; document.getElementById('2310.05021v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2310.05021v1-abstract-full" style="display: none;"> This paper has delved into the pressing need for intelligent emergency control in large-scale power systems, which are experiencing significant transformations and are operating closer to their limits with more uncertainties. Learning-based control methods are promising and have shown effectiveness for intelligent power system control. 
However, when they are applied to large-scale power systems, there are multifaceted challenges such as scalability, adaptiveness, and security posed by the complex power system landscape, which demand comprehensive solutions. The paper first proposes and instantiates a convergence framework for integrating power systems physics, machine learning, advanced computing, and grid control to realize intelligent grid control at a large scale. Our developed methods and platform based on the convergence framework have been applied to a large (more than 3000 buses) Texas power system, and tested with 56000 scenarios. Our work achieved a 26% reduction in load shedding on average and outperformed existing rule-based control in 99.7% of the test scenarios. The results demonstrated the potential of the proposed convergence framework and DRL-based intelligent control for the future grid. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.05021v1-abstract-full').style.display = 'none'; document.getElementById('2310.05021v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">submitted to PSCC 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2310.00704">arXiv:2310.00704</a> <span> [<a href="https://arxiv.org/pdf/2310.00704">pdf</a>, <a href="https://arxiv.org/format/2310.00704">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> UniAudio: An Audio Foundation Model Toward Universal Audio Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Yang%2C+D">Dongchao Yang</a>, <a href="/search/eess?searchtype=author&query=Tian%2C+J">Jinchuan Tian</a>, <a href="/search/eess?searchtype=author&query=Tan%2C+X">Xu Tan</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+R">Rongjie Huang</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+S">Songxiang Liu</a>, <a href="/search/eess?searchtype=author&query=Chang%2C+X">Xuankai Chang</a>, <a href="/search/eess?searchtype=author&query=Shi%2C+J">Jiatong Shi</a>, <a href="/search/eess?searchtype=author&query=Zhao%2C+S">Sheng Zhao</a>, <a href="/search/eess?searchtype=author&query=Bian%2C+J">Jiang Bian</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+X">Xixin Wu</a>, <a href="/search/eess?searchtype=author&query=Zhao%2C+Z">Zhou Zhao</a>, <a href="/search/eess?searchtype=author&query=Watanabe%2C+S">Shinji Watanabe</a>, <a href="/search/eess?searchtype=author&query=Meng%2C+H">Helen Meng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2310.00704v5-abstract-short" style="display: inline;"> Large Language models (LLM) have demonstrated the capability to handle a variety of generative tasks. 
This paper presents the UniAudio system, which, unlike prior task-specific approaches, leverages LLM techniques to generate multiple types of audio (including speech, sounds, music, and singing) with given input conditions. UniAudio 1) first tokenizes all types of target audio along with other con… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.00704v5-abstract-full').style.display = 'inline'; document.getElementById('2310.00704v5-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2310.00704v5-abstract-full" style="display: none;"> Large Language models (LLM) have demonstrated the capability to handle a variety of generative tasks. This paper presents the UniAudio system, which, unlike prior task-specific approaches, leverages LLM techniques to generate multiple types of audio (including speech, sounds, music, and singing) with given input conditions. UniAudio 1) first tokenizes all types of target audio along with other condition modalities, 2) concatenates source-target pair as a single sequence, and 3) performs next-token prediction using LLM. Also, a multi-scale Transformer model is proposed to handle the overly long sequences caused by the residual vector quantization based neural codec in tokenization. Training of UniAudio is scaled up to 165K hours of audio and 1B parameters, based on all generative tasks, aiming to obtain sufficient prior knowledge not only in the intrinsic properties of audio but also the inter-relationship between audio and other modalities. Therefore, the trained UniAudio model has the potential to become a foundation model for universal audio generation: it shows strong capability in all trained tasks and can seamlessly support new audio generation tasks after simple fine-tuning. Experiments demonstrate that UniAudio achieves state-of-the-art or at least competitive results on most of the 11 tasks. Demo and code are released at https://github.com/yangdongchao/UniAudio <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.00704v5-abstract-full').style.display = 'none'; document.getElementById('2310.00704v5-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 1 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2023. 
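<p class="is-size-7">A toy sketch of the serialization step the UniAudio abstract above describes: condition tokens and target audio tokens are concatenated into one sequence and modeled left to right; the special tokens and example ids below are made up for illustration.</p>
<pre><code class="language-python">
# Toy serialization for LM-style audio generation: condition tokens and target
# audio tokens form a single sequence trained with next-token prediction.
BOS, SEP, EOS = "[bos]", "[sep]", "[eos]"           # assumed special tokens

def build_sequence(condition_tokens, audio_tokens):
    return [BOS, *condition_tokens, SEP, *audio_tokens, EOS]

def next_token_pairs(sequence):
    """(context, target) pairs a decoder-only LM is trained on."""
    return [(sequence[:i], sequence[i]) for i in range(1, len(sequence))]

seq = build_sequence(["[task:tts]", "h", "i"], ["a17", "a903", "a55"])
for ctx, tgt in next_token_pairs(seq)[-3:]:
    print(ctx, "->", tgt)
</code></pre>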
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2309.07566">arXiv:2309.07566</a> <span> [<a href="https://arxiv.org/pdf/2309.07566">pdf</a>, <a href="https://arxiv.org/format/2309.07566">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Speech-to-Speech Translation with Discrete-Unit-Based Style Transfer </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Wang%2C+Y">Yongqi Wang</a>, <a href="/search/eess?searchtype=author&query=Bai%2C+J">Jionghao Bai</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+R">Rongjie Huang</a>, <a href="/search/eess?searchtype=author&query=Li%2C+R">Ruiqi Li</a>, <a href="/search/eess?searchtype=author&query=Hong%2C+Z">Zhiqing Hong</a>, <a href="/search/eess?searchtype=author&query=Zhao%2C+Z">Zhou Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2309.07566v2-abstract-short" style="display: inline;"> Direct speech-to-speech translation (S2ST) with discrete self-supervised representations has achieved remarkable accuracy, but is unable to preserve the speaker timbre of the source speech. Meanwhile, the scarcity of high-quality speaker-parallel data poses a challenge for learning style transfer during translation. We design an S2ST pipeline with style-transfer capability on the basis of discrete… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.07566v2-abstract-full').style.display = 'inline'; document.getElementById('2309.07566v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2309.07566v2-abstract-full" style="display: none;"> Direct speech-to-speech translation (S2ST) with discrete self-supervised representations has achieved remarkable accuracy, but is unable to preserve the speaker timbre of the source speech. Meanwhile, the scarcity of high-quality speaker-parallel data poses a challenge for learning style transfer during translation. We design an S2ST pipeline with style-transfer capability on the basis of discrete self-supervised speech representations and codec units. The acoustic language model we introduce for style transfer leverages self-supervised in-context learning, acquiring style transfer ability without relying on any speaker-parallel data, thereby overcoming data scarcity. By using extensive training data, our model achieves zero-shot cross-lingual style transfer on previously unseen source languages. Experiments show that our model generates translated speeches with high fidelity and speaker similarity. Audio samples are available at http://stylelm.github.io/ . 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.07566v2-abstract-full').style.display = 'none'; document.getElementById('2309.07566v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 14 September, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">accepted by ACL SRW 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2307.02747">arXiv:2307.02747</a> <span> [<a href="https://arxiv.org/pdf/2307.02747">pdf</a>, <a href="https://arxiv.org/format/2307.02747">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/WCNC55385.2023.10118995">10.1109/WCNC55385.2023.10118995 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Computing Offloading and Semantic Compression for Intelligent Computing Tasks in MEC Systems </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zheng%2C+Y">Yuanpeng Zheng</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+T">Tiankui Zhang</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+R">Rong Huang</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+Y">Yapeng Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2307.02747v1-abstract-short" style="display: inline;"> This paper investigates the intelligent computing task-oriented computing offloading and semantic compression in mobile edge computing (MEC) systems. With the popularity of intelligent applications in various industries, terminals increasingly need to offload intelligent computing tasks with complex demands to MEC servers for computing, which is a great challenge for bandwidth and computing capaci… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2307.02747v1-abstract-full').style.display = 'inline'; document.getElementById('2307.02747v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2307.02747v1-abstract-full" style="display: none;"> This paper investigates the intelligent computing task-oriented computing offloading and semantic compression in mobile edge computing (MEC) systems. 
With the popularity of intelligent applications in various industries, terminals increasingly need to offload intelligent computing tasks with complex demands to MEC servers for computing, which is a great challenge for bandwidth and computing capacity allocation in MEC systems. Considering the accuracy requirement of intelligent computing tasks, we formulate an optimization problem of computing offloading and semantic compression. We jointly optimize the system utility, which captures both computing accuracy and task delay. To solve the proposed optimization problem, we decompose it into a computing capacity allocation subproblem and a compression offloading subproblem and obtain solutions through convex optimization and successive convex approximation. After that, the offloading decisions, computing capacity and compression ratio are obtained in closed form. We then design the computing offloading and semantic compression algorithm for intelligent computing tasks in MEC systems. Simulation results show that our algorithm converges quickly and achieves better performance and resource utilization efficiency than the benchmarks as the total number of users and the computing capacity vary. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2307.02747v1-abstract-full').style.display = 'none'; document.getElementById('2307.02747v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 July, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2306.08280">arXiv:2306.08280</a> <span> [<a href="https://arxiv.org/pdf/2306.08280">pdf</a>, <a href="https://arxiv.org/format/2306.08280">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Differentially Private Wireless Federated Learning Using Orthogonal Sequences </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Wei%2C+X">Xizixiang Wei</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+T">Tianhao Wang</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+R">Ruiquan Huang</a>, <a href="/search/eess?searchtype=author&query=Shen%2C+C">Cong Shen</a>, <a href="/search/eess?searchtype=author&query=Yang%2C+J">Jing Yang</a>, <a href="/search/eess?searchtype=author&query=Poor%2C+H+V">H.
Vincent Poor</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2306.08280v2-abstract-short" style="display: inline;"> We propose a privacy-preserving uplink over-the-air computation (AirComp) method, termed FLORAS, for single-input single-output (SISO) wireless federated learning (FL) systems. From the perspective of communication designs, FLORAS eliminates the requirement of channel state information at the transmitters (CSIT) by leveraging the properties of orthogonal sequences. From the privacy perspective, we… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.08280v2-abstract-full').style.display = 'inline'; document.getElementById('2306.08280v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2306.08280v2-abstract-full" style="display: none;"> We propose a privacy-preserving uplink over-the-air computation (AirComp) method, termed FLORAS, for single-input single-output (SISO) wireless federated learning (FL) systems. From the perspective of communication designs, FLORAS eliminates the requirement of channel state information at the transmitters (CSIT) by leveraging the properties of orthogonal sequences. From the privacy perspective, we prove that FLORAS offers both item-level and client-level differential privacy (DP) guarantees. Moreover, by properly adjusting the system parameters, FLORAS can flexibly achieve different DP levels at no additional cost. A new FL convergence bound is derived which, combined with the privacy guarantees, allows for a smooth tradeoff between the achieved convergence rate and differential privacy levels. Experimental results demonstrate the advantages of FLORAS compared with the baseline AirComp method, and validate that the analytical results can guide the design of privacy-preserving FL with different tradeoff requirements on the model convergence and privacy levels. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.08280v2-abstract-full').style.display = 'none'; document.getElementById('2306.08280v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 14 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2023. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">33 pages, 5 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2306.08133">arXiv:2306.08133</a> <span> [<a href="https://arxiv.org/pdf/2306.08133">pdf</a>, <a href="https://arxiv.org/ps/2306.08133">ps</a>, <a href="https://arxiv.org/format/2306.08133">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Large-scale Language Model Rescoring on Long-form Data </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Chen%2C+T">Tongzhou Chen</a>, <a href="/search/eess?searchtype=author&query=Allauzen%2C+C">Cyril Allauzen</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+Y">Yinghui Huang</a>, <a href="/search/eess?searchtype=author&query=Park%2C+D">Daniel Park</a>, <a href="/search/eess?searchtype=author&query=Rybach%2C+D">David Rybach</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+W+R">W. Ronny Huang</a>, <a href="/search/eess?searchtype=author&query=Cabrera%2C+R">Rodrigo Cabrera</a>, <a href="/search/eess?searchtype=author&query=Audhkhasi%2C+K">Kartik Audhkhasi</a>, <a href="/search/eess?searchtype=author&query=Ramabhadran%2C+B">Bhuvana Ramabhadran</a>, <a href="/search/eess?searchtype=author&query=Moreno%2C+P+J">Pedro J. Moreno</a>, <a href="/search/eess?searchtype=author&query=Riley%2C+M">Michael Riley</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2306.08133v2-abstract-short" style="display: inline;"> In this work, we study the impact of Large-scale Language Models (LLM) on Automated Speech Recognition (ASR) of YouTube videos, which we use as a source for long-form ASR. We demonstrate up to 8\% relative reduction in Word Error Rate (WER) on US English (en-us) and code-switched Indian English (en-in) long-form ASR test sets and a reduction of up to 30\% relative on Salient Term Error Rate (STER)… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.08133v2-abstract-full').style.display = 'inline'; document.getElementById('2306.08133v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2306.08133v2-abstract-full" style="display: none;"> In this work, we study the impact of Large-scale Language Models (LLM) on Automated Speech Recognition (ASR) of YouTube videos, which we use as a source for long-form ASR. We demonstrate up to 8\% relative reduction in Word Error Rate (WER) on US English (en-us) and code-switched Indian English (en-in) long-form ASR test sets and a reduction of up to 30\% relative on Salient Term Error Rate (STER) over a strong first-pass baseline that uses a maximum-entropy based language model. Improved lattice processing that produces a lattice with a proper (non-tree) digraph topology and carries context from the 1-best hypothesis of the previous segment(s) results in significant wins in rescoring with LLMs.
We also find that the gains in performance from the combination of LLMs trained on vast quantities of available data (such as C4) and conventional neural LMs is additive and significantly outperforms a strong first-pass baseline with a maximum entropy LM. Copyright 2023 IEEE. Personal use of this material is permitted. Permission from IEEE must be obtained for all other uses, in any current or future media, including reprinting/republishing this material for advertising or promotional purposes, creating new collective works, for resale or redistribution to servers or lists, or reuse of any copyrighted component of this work in other works. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.08133v2-abstract-full').style.display = 'none'; document.getElementById('2306.08133v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 September, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 13 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages, accepted in ICASSP 2023</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP) </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2306.03509">arXiv:2306.03509</a> <span> [<a href="https://arxiv.org/pdf/2306.03509">pdf</a>, <a href="https://arxiv.org/format/2306.03509">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Mega-TTS: Zero-Shot Text-to-Speech at Scale with Intrinsic Inductive Bias </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Jiang%2C+Z">Ziyue Jiang</a>, <a href="/search/eess?searchtype=author&query=Ren%2C+Y">Yi Ren</a>, <a href="/search/eess?searchtype=author&query=Ye%2C+Z">Zhenhui Ye</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+J">Jinglin Liu</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+C">Chen Zhang</a>, <a href="/search/eess?searchtype=author&query=Yang%2C+Q">Qian Yang</a>, <a href="/search/eess?searchtype=author&query=Ji%2C+S">Shengpeng Ji</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+R">Rongjie Huang</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+C">Chunfeng Wang</a>, <a href="/search/eess?searchtype=author&query=Yin%2C+X">Xiang Yin</a>, <a href="/search/eess?searchtype=author&query=Ma%2C+Z">Zejun Ma</a>, <a href="/search/eess?searchtype=author&query=Zhao%2C+Z">Zhou Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2306.03509v1-abstract-short" style="display: inline;"> Scaling text-to-speech 
to a large and wild dataset has been proven to be highly effective in achieving timbre and speech style generalization, particularly in zero-shot TTS. However, previous works usually encode speech into latent using audio codec and use autoregressive language models or diffusion models to generate it, which ignores the intrinsic nature of speech and may lead to inferior or un… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.03509v1-abstract-full').style.display = 'inline'; document.getElementById('2306.03509v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2306.03509v1-abstract-full" style="display: none;"> Scaling text-to-speech to a large and wild dataset has been proven to be highly effective in achieving timbre and speech style generalization, particularly in zero-shot TTS. However, previous works usually encode speech into latent using audio codec and use autoregressive language models or diffusion models to generate it, which ignores the intrinsic nature of speech and may lead to inferior or uncontrollable results. We argue that speech can be decomposed into several attributes (e.g., content, timbre, prosody, and phase) and each of them should be modeled using a module with appropriate inductive biases. From this perspective, we carefully design a novel and large zero-shot TTS system called Mega-TTS, which is trained with large-scale wild data and models different attributes in different ways: 1) Instead of using latent encoded by audio codec as the intermediate feature, we still choose spectrogram as it separates the phase and other attributes very well. Phase can be appropriately constructed by the GAN-based vocoder and does not need to be modeled by the language model. 2) We model the timbre using global vectors since timbre is a global attribute that changes slowly over time. 3) We further use a VQGAN-based acoustic model to generate the spectrogram and a latent code language model to fit the distribution of prosody, since prosody changes quickly over time in a sentence, and language models can capture both local and long-range dependencies. We scale Mega-TTS to multi-domain datasets with 20K hours of speech and evaluate its performance on unseen speakers. Experimental results demonstrate that Mega-TTS surpasses state-of-the-art TTS systems on zero-shot TTS, speech editing, and cross-lingual TTS tasks, with superior naturalness, robustness, and speaker similarity due to the proper inductive bias of each module. Audio samples are available at https://mega-tts.github.io/demo-page. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.03509v1-abstract-full').style.display = 'none'; document.getElementById('2306.03509v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2023. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2305.19269">arXiv:2305.19269</a> <span> [<a href="https://arxiv.org/pdf/2305.19269">pdf</a>, <a href="https://arxiv.org/format/2305.19269">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Make-A-Voice: Unified Voice Synthesis With Discrete Representation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Huang%2C+R">Rongjie Huang</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+C">Chunlei Zhang</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+Y">Yongqi Wang</a>, <a href="/search/eess?searchtype=author&query=Yang%2C+D">Dongchao Yang</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+L">Luping Liu</a>, <a href="/search/eess?searchtype=author&query=Ye%2C+Z">Zhenhui Ye</a>, <a href="/search/eess?searchtype=author&query=Jiang%2C+Z">Ziyue Jiang</a>, <a href="/search/eess?searchtype=author&query=Weng%2C+C">Chao Weng</a>, <a href="/search/eess?searchtype=author&query=Zhao%2C+Z">Zhou Zhao</a>, <a href="/search/eess?searchtype=author&query=Yu%2C+D">Dong Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2305.19269v1-abstract-short" style="display: inline;"> Various applications of voice synthesis have been developed independently despite the fact that they generate "voice" as output in common. In addition, the majority of voice synthesis models currently rely on annotated audio data, but it is crucial to scale them to self-supervised datasets in order to effectively capture the wide range of acoustic variations present in human voice, including speak… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.19269v1-abstract-full').style.display = 'inline'; document.getElementById('2305.19269v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2305.19269v1-abstract-full" style="display: none;"> Various applications of voice synthesis have been developed independently despite the fact that they generate "voice" as output in common. In addition, the majority of voice synthesis models currently rely on annotated audio data, but it is crucial to scale them to self-supervised datasets in order to effectively capture the wide range of acoustic variations present in human voice, including speaker identity, emotion, and prosody. In this work, we propose Make-A-Voice, a unified framework for synthesizing and manipulating voice signals from discrete representations. 
Make-A-Voice leverages a "coarse-to-fine" approach to model the human voice, which involves three stages: 1) semantic stage: model high-level transformation between linguistic content and self-supervised semantic tokens, 2) acoustic stage: introduce varying control signals as acoustic conditions for semantic-to-acoustic modeling, and 3) generation stage: synthesize high-fidelity waveforms from acoustic tokens. Make-A-Voice offers notable benefits as a unified voice synthesis framework: 1) Data scalability: the major backbone (i.e., acoustic and generation stage) does not require any annotations, and thus the training data could be scaled up. 2) Controllability and conditioning flexibility: we investigate different conditioning mechanisms and effectively handle three voice synthesis applications, including text-to-speech (TTS), voice conversion (VC), and singing voice synthesis (SVS) by re-synthesizing the discrete voice representations with prompt guidance. Experimental results demonstrate that Make-A-Voice exhibits superior audio quality and style similarity compared with competitive baseline models. Audio samples are available at https://Make-A-Voice.github.io <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.19269v1-abstract-full').style.display = 'none'; document.getElementById('2305.19269v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2305.18474">arXiv:2305.18474</a> <span> [<a href="https://arxiv.org/pdf/2305.18474">pdf</a>, <a href="https://arxiv.org/format/2305.18474">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Make-An-Audio 2: Temporal-Enhanced Text-to-Audio Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Huang%2C+J">Jiawei Huang</a>, <a href="/search/eess?searchtype=author&query=Ren%2C+Y">Yi Ren</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+R">Rongjie Huang</a>, <a href="/search/eess?searchtype=author&query=Yang%2C+D">Dongchao Yang</a>, <a href="/search/eess?searchtype=author&query=Ye%2C+Z">Zhenhui Ye</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+C">Chen Zhang</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+J">Jinglin Liu</a>, <a href="/search/eess?searchtype=author&query=Yin%2C+X">Xiang Yin</a>, <a href="/search/eess?searchtype=author&query=Ma%2C+Z">Zejun Ma</a>, <a href="/search/eess?searchtype=author&query=Zhao%2C+Z">Zhou Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2305.18474v1-abstract-short" style="display: inline;"> Large diffusion models have been successful in text-to-audio 
(T2A) synthesis tasks, but they often suffer from common issues such as semantic misalignment and poor temporal consistency due to limited natural language understanding and data scarcity. Additionally, 2D spatial structures widely used in T2A works lead to unsatisfactory audio quality when generating variable-length audio samples since… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.18474v1-abstract-full').style.display = 'inline'; document.getElementById('2305.18474v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2305.18474v1-abstract-full" style="display: none;"> Large diffusion models have been successful in text-to-audio (T2A) synthesis tasks, but they often suffer from common issues such as semantic misalignment and poor temporal consistency due to limited natural language understanding and data scarcity. Additionally, 2D spatial structures widely used in T2A works lead to unsatisfactory audio quality when generating variable-length audio samples since they do not adequately prioritize temporal information. To address these challenges, we propose Make-an-Audio 2, a latent diffusion-based T2A method that builds on the success of Make-an-Audio. Our approach includes several techniques to improve semantic alignment and temporal consistency: Firstly, we use pre-trained large language models (LLMs) to parse the text into structured <event & order> pairs for better temporal information capture. We also introduce another structured-text encoder to aid in learning semantic alignment during the diffusion denoising process. To improve the performance of variable length generation and enhance the temporal information extraction, we design a feed-forward Transformer-based diffusion denoiser. Finally, we use LLMs to augment and transform a large amount of audio-label data into audio-text datasets to alleviate the problem of scarcity of temporal data. Extensive experiments show that our method outperforms baseline models in both objective and subjective metrics, and achieves significant gains in temporal information understanding, semantic consistency, and sound quality. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.18474v1-abstract-full').style.display = 'none'; document.getElementById('2305.18474v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2023. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2305.18419">arXiv:2305.18419</a> <span> [<a href="https://arxiv.org/pdf/2305.18419">pdf</a>, <a href="https://arxiv.org/format/2305.18419">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Semantic Segmentation with Bidirectional Language Models Improves Long-form ASR </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Huang%2C+W+R">W. Ronny Huang</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+H">Hao Zhang</a>, <a href="/search/eess?searchtype=author&query=Kumar%2C+S">Shankar Kumar</a>, <a href="/search/eess?searchtype=author&query=Chang%2C+S">Shuo-yiin Chang</a>, <a href="/search/eess?searchtype=author&query=Sainath%2C+T+N">Tara N. Sainath</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2305.18419v1-abstract-short" style="display: inline;"> We propose a method of segmenting long-form speech by separating semantically complete sentences within the utterance. This prevents the ASR decoder from needlessly processing faraway context while also preventing it from missing relevant context within the current sentence. Semantically complete sentence boundaries are typically demarcated by punctuation in written text; but unfortunately, spoken… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.18419v1-abstract-full').style.display = 'inline'; document.getElementById('2305.18419v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2305.18419v1-abstract-full" style="display: none;"> We propose a method of segmenting long-form speech by separating semantically complete sentences within the utterance. This prevents the ASR decoder from needlessly processing faraway context while also preventing it from missing relevant context within the current sentence. Semantically complete sentence boundaries are typically demarcated by punctuation in written text; but unfortunately, spoken real-world utterances rarely contain punctuation. We address this limitation by distilling punctuation knowledge from a bidirectional teacher language model (LM) trained on written, punctuated text. We compare our segmenter, which is distilled from the LM teacher, against a segmenter distilled from an acoustic-pause-based teacher used in other works, on a streaming ASR pipeline. The pipeline with our segmenter achieves a 3.2% relative WER gain along with a 60 ms median end-of-segment latency reduction on a YouTube captioning task.
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.18419v1-abstract-full').style.display = 'none'; document.getElementById('2305.18419v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Interspeech 2023. First 3 authors contributed equally</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2305.15403">arXiv:2305.15403</a> <span> [<a href="https://arxiv.org/pdf/2305.15403">pdf</a>, <a href="https://arxiv.org/format/2305.15403">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> AV-TranSpeech: Audio-Visual Robust Speech-to-Speech Translation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Huang%2C+R">Rongjie Huang</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+H">Huadai Liu</a>, <a href="/search/eess?searchtype=author&query=Cheng%2C+X">Xize Cheng</a>, <a href="/search/eess?searchtype=author&query=Ren%2C+Y">Yi Ren</a>, <a href="/search/eess?searchtype=author&query=Li%2C+L">Linjun Li</a>, <a href="/search/eess?searchtype=author&query=Ye%2C+Z">Zhenhui Ye</a>, <a href="/search/eess?searchtype=author&query=He%2C+J">Jinzheng He</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+L">Lichao Zhang</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+J">Jinglin Liu</a>, <a href="/search/eess?searchtype=author&query=Yin%2C+X">Xiang Yin</a>, <a href="/search/eess?searchtype=author&query=Zhao%2C+Z">Zhou Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2305.15403v1-abstract-short" style="display: inline;"> Direct speech-to-speech translation (S2ST) aims to convert speech from one language into another, and has demonstrated significant progress to date. Despite the recent success, current S2ST models still suffer from distinct degradation in noisy environments and fail to translate visual speech (i.e., the movement of lips and teeth). In this work, we present AV-TranSpeech, the first audio-visual spe… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.15403v1-abstract-full').style.display = 'inline'; document.getElementById('2305.15403v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2305.15403v1-abstract-full" style="display: none;"> Direct speech-to-speech translation (S2ST) aims to convert speech from one language into another, and has demonstrated significant progress to date. 
Despite the recent success, current S2ST models still suffer from distinct degradation in noisy environments and fail to translate visual speech (i.e., the movement of lips and teeth). In this work, we present AV-TranSpeech, the first audio-visual speech-to-speech (AV-S2ST) translation model without relying on intermediate text. AV-TranSpeech complements the audio stream with visual information to promote system robustness and opens up a host of practical applications: dictation or dubbing archival films. To mitigate the data scarcity with limited parallel AV-S2ST data, we 1) explore self-supervised pre-training with unlabeled audio-visual data to learn contextual representation, and 2) introduce cross-modal distillation with S2ST models trained on the audio-only corpus to further reduce the requirements of visual data. Experimental results on two language pairs demonstrate that AV-TranSpeech outperforms audio-only models under all settings regardless of the type of noise. With low-resource audio-visual data (10h, 30h), cross-modal distillation yields an improvement of 7.6 BLEU on average compared with baselines. Audio samples are available at https://AV-TranSpeech.github.io <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.15403v1-abstract-full').style.display = 'none'; document.getElementById('2305.15403v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to ACL 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2305.13612">arXiv:2305.13612</a> <span> [<a href="https://arxiv.org/pdf/2305.13612">pdf</a>, <a href="https://arxiv.org/format/2305.13612">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> FluentSpeech: Stutter-Oriented Automatic Speech Editing with Context-Aware Diffusion Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Jiang%2C+Z">Ziyue Jiang</a>, <a href="/search/eess?searchtype=author&query=Yang%2C+Q">Qian Yang</a>, <a href="/search/eess?searchtype=author&query=Zuo%2C+J">Jialong Zuo</a>, <a href="/search/eess?searchtype=author&query=Ye%2C+Z">Zhenhui Ye</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+R">Rongjie Huang</a>, <a href="/search/eess?searchtype=author&query=Ren%2C+Y">Yi Ren</a>, <a href="/search/eess?searchtype=author&query=Zhao%2C+Z">Zhou Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2305.13612v1-abstract-short" style="display: inline;"> Stutter removal is an essential scenario in the field of speech editing. 
However, when the speech recording contains stutters, the existing text-based speech editing approaches still suffer from: 1) the over-smoothing problem in the edited speech; 2) lack of robustness due to the noise introduced by stutter; 3) to remove the stutters, users are required to determine the edited region manually. To… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.13612v1-abstract-full').style.display = 'inline'; document.getElementById('2305.13612v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2305.13612v1-abstract-full" style="display: none;"> Stutter removal is an essential scenario in the field of speech editing. However, when the speech recording contains stutters, the existing text-based speech editing approaches still suffer from: 1) the over-smoothing problem in the edited speech; 2) lack of robustness due to the noise introduced by stutter; 3) to remove the stutters, users are required to determine the edited region manually. To tackle the challenges in stutter removal, we propose FluentSpeech, a stutter-oriented automatic speech editing model. Specifically, 1) we propose a context-aware diffusion model that iteratively refines the modified mel-spectrogram with the guidance of context features; 2) we introduce a stutter predictor module to inject the stutter information into the hidden sequence; 3) we also propose a stutter-oriented automatic speech editing (SASE) dataset that contains spontaneous speech recordings with time-aligned stutter labels to train the automatic stutter localization model. Experimental results on VCTK and LibriTTS datasets demonstrate that our model achieves state-of-the-art performance on speech editing. Further experiments on our SASE dataset show that FluentSpeech can effectively improve the fluency of stuttering speech in terms of objective and subjective metrics. Code and audio samples can be found at https://github.com/Zain-Jiang/Speech-Editing-Toolkit. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.13612v1-abstract-full').style.display = 'none'; document.getElementById('2305.13612v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2023. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ACL 2023 (Findings)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2305.12708">arXiv:2305.12708</a> <span> [<a href="https://arxiv.org/pdf/2305.12708">pdf</a>, <a href="https://arxiv.org/format/2305.12708">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> ViT-TTS: Visual Text-to-Speech with Scalable Diffusion Transformer </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Liu%2C+H">Huadai Liu</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+R">Rongjie Huang</a>, <a href="/search/eess?searchtype=author&query=Lin%2C+X">Xuan Lin</a>, <a href="/search/eess?searchtype=author&query=Xu%2C+W">Wenqiang Xu</a>, <a href="/search/eess?searchtype=author&query=Zheng%2C+M">Maozong Zheng</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+H">Hong Chen</a>, <a href="/search/eess?searchtype=author&query=He%2C+J">Jinzheng He</a>, <a href="/search/eess?searchtype=author&query=Zhao%2C+Z">Zhou Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2305.12708v2-abstract-short" style="display: inline;"> Text-to-speech(TTS) has undergone remarkable improvements in performance, particularly with the advent of Denoising Diffusion Probabilistic Models (DDPMs). However, the perceived quality of audio depends not solely on its content, pitch, rhythm, and energy, but also on the physical environment. In this work, we propose ViT-TTS, the first visual TTS model with scalable diffusion transformers. ViT-T… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.12708v2-abstract-full').style.display = 'inline'; document.getElementById('2305.12708v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2305.12708v2-abstract-full" style="display: none;"> Text-to-speech(TTS) has undergone remarkable improvements in performance, particularly with the advent of Denoising Diffusion Probabilistic Models (DDPMs). However, the perceived quality of audio depends not solely on its content, pitch, rhythm, and energy, but also on the physical environment. In this work, we propose ViT-TTS, the first visual TTS model with scalable diffusion transformers. ViT-TTS complement the phoneme sequence with the visual information to generate high-perceived audio, opening up new avenues for practical applications of AR and VR to allow a more immersive and realistic audio experience. To mitigate the data scarcity in learning visual acoustic information, we 1) introduce a self-supervised learning framework to enhance both the visual-text encoder and denoiser decoder; 2) leverage the diffusion transformer scalable in terms of parameters and capacity to learn visual scene information. 
Experimental results demonstrate that ViT-TTS achieves new state-of-the-art results, outperforming cascaded systems and other baselines regardless of the visibility of the scene. With low-resource data (1h, 2h, 5h), ViT-TTS achieves comparative results with rich-resource baselines.~\footnote{Audio samples are available at \url{https://ViT-TTS.github.io/.}} <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.12708v2-abstract-full').style.display = 'none'; document.getElementById('2305.12708v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 22 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by EMNLP 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2305.12552">arXiv:2305.12552</a> <span> [<a href="https://arxiv.org/pdf/2305.12552">pdf</a>, <a href="https://arxiv.org/format/2305.12552">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Wav2SQL: Direct Generalizable Speech-To-SQL Parsing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Liu%2C+H">Huadai Liu</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+R">Rongjie Huang</a>, <a href="/search/eess?searchtype=author&query=He%2C+J">Jinzheng He</a>, <a href="/search/eess?searchtype=author&query=Sun%2C+G">Gang Sun</a>, <a href="/search/eess?searchtype=author&query=Shen%2C+R">Ran Shen</a>, <a href="/search/eess?searchtype=author&query=Cheng%2C+X">Xize Cheng</a>, <a href="/search/eess?searchtype=author&query=Zhao%2C+Z">Zhou Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2305.12552v1-abstract-short" style="display: inline;"> Speech-to-SQL (S2SQL) aims to convert spoken questions into SQL queries given relational databases, which has been traditionally implemented in a cascaded manner while facing the following challenges: 1) model training is faced with the major issue of data scarcity, where limited parallel data is available; and 2) the systems should be robust enough to handle diverse out-of-domain speech samples t… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.12552v1-abstract-full').style.display = 'inline'; document.getElementById('2305.12552v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2305.12552v1-abstract-full" style="display: none;"> Speech-to-SQL (S2SQL) aims to convert spoken questions into SQL queries given relational databases, which has been traditionally implemented in a cascaded manner while facing 
the following challenges: 1) model training is faced with the major issue of data scarcity, where limited parallel data is available; and 2) the systems should be robust enough to handle diverse out-of-domain speech samples that differ from the source data. In this work, we propose the first direct speech-to-SQL parsing model, Wav2SQL, which avoids error compounding across cascaded systems. Specifically, 1) to accelerate speech-driven SQL parsing research in the community, we release a large-scale and multi-speaker dataset MASpider; 2) leveraging the recent progress in large-scale pre-training, we show that it alleviates the data scarcity issue and allows for direct speech-to-SQL parsing; and 3) we include the speech re-programming and gradient reversal classifier techniques to reduce acoustic variance and learn style-agnostic representations, improving generalization to unseen out-of-domain custom data. Experimental results demonstrate that Wav2SQL avoids error compounding and achieves state-of-the-art results by up to 2.5\% accuracy improvement over the baseline. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.12552v1-abstract-full').style.display = 'none'; document.getElementById('2305.12552v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2305.10763">arXiv:2305.10763</a> <span> [<a href="https://arxiv.org/pdf/2305.10763">pdf</a>, <a href="https://arxiv.org/format/2305.10763">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> CLAPSpeech: Learning Prosody from Text Context with Contrastive Language-Audio Pre-training </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Ye%2C+Z">Zhenhui Ye</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+R">Rongjie Huang</a>, <a href="/search/eess?searchtype=author&query=Ren%2C+Y">Yi Ren</a>, <a href="/search/eess?searchtype=author&query=Jiang%2C+Z">Ziyue Jiang</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+J">Jinglin Liu</a>, <a href="/search/eess?searchtype=author&query=He%2C+J">Jinzheng He</a>, <a href="/search/eess?searchtype=author&query=Yin%2C+X">Xiang Yin</a>, <a href="/search/eess?searchtype=author&query=Zhao%2C+Z">Zhou Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2305.10763v1-abstract-short" style="display: inline;"> Improving text representation has attracted much attention to achieve expressive text-to-speech (TTS). However, existing works only implicitly learn the prosody with masked token reconstruction tasks, which leads to low training efficiency and difficulty in prosody modeling.
We propose CLAPSpeech, a cross-modal contrastive pre-training framework that explicitly learns the prosody variance of the s… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.10763v1-abstract-full').style.display = 'inline'; document.getElementById('2305.10763v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2305.10763v1-abstract-full" style="display: none;"> Improving text representation has attracted much attention to achieve expressive text-to-speech (TTS). However, existing works only implicitly learn the prosody with masked token reconstruction tasks, which leads to low training efficiency and difficulty in prosody modeling. We propose CLAPSpeech, a cross-modal contrastive pre-training framework that explicitly learns the prosody variance of the same text token under different contexts. Specifically, 1) We encourage the model to connect the text context with its corresponding prosody pattern in the joint multi-modal space with the elaborate design of the encoder inputs and contrastive loss; 2) We introduce a multi-scale pre-training pipeline to capture prosody patterns in multiple levels. We show how to incorporate CLAPSpeech into existing TTS models for better prosody. Experiments on three datasets not only show that CLAPSpeech could improve the prosody prediction for existing TTS methods, but also demonstrate its generalization ability to adapt to multiple languages and multi-speaker TTS. We also deeply analyze the principle behind the performance of CLAPSpeech. Ablation studies demonstrate the necessity of each component in our method. Source code and audio samples are available at https://clapspeech.github.io. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.10763v1-abstract-full').style.display = 'none'; document.getElementById('2305.10763v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2023. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ACL 2023 (Main Conference)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2305.10686">arXiv:2305.10686</a> <span> [<a href="https://arxiv.org/pdf/2305.10686">pdf</a>, <a href="https://arxiv.org/format/2305.10686">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> RMSSinger: Realistic-Music-Score based Singing Voice Synthesis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=He%2C+J">Jinzheng He</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+J">Jinglin Liu</a>, <a href="/search/eess?searchtype=author&query=Ye%2C+Z">Zhenhui Ye</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+R">Rongjie Huang</a>, <a href="/search/eess?searchtype=author&query=Cui%2C+C">Chenye Cui</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+H">Huadai Liu</a>, <a href="/search/eess?searchtype=author&query=Zhao%2C+Z">Zhou Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2305.10686v1-abstract-short" style="display: inline;"> We are interested in a challenging task, Realistic-Music-Score based Singing Voice Synthesis (RMS-SVS). RMS-SVS aims to generate high-quality singing voices given realistic music scores with different note types (grace, slur, rest, etc.). Though significant progress has been achieved, recent singing voice synthesis (SVS) methods are limited to fine-grained music scores, which require a complicated… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.10686v1-abstract-full').style.display = 'inline'; document.getElementById('2305.10686v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2305.10686v1-abstract-full" style="display: none;"> We are interested in a challenging task, Realistic-Music-Score based Singing Voice Synthesis (RMS-SVS). RMS-SVS aims to generate high-quality singing voices given realistic music scores with different note types (grace, slur, rest, etc.). Though significant progress has been achieved, recent singing voice synthesis (SVS) methods are limited to fine-grained music scores, which require a complicated data collection pipeline with time-consuming manual annotation to align music notes with phonemes. Furthermore, this manual annotation destroys the regularity of note durations in music scores, making fine-grained music scores inconvenient for composing. To tackle these challenges, we propose RMSSinger, the first RMS-SVS method, which takes realistic music scores as input, eliminating most of the tedious manual annotation and avoiding the aforementioned inconvenience.
Since music scores are based on words rather than phonemes, in RMSSinger we introduce word-level modeling to avoid the time-consuming phoneme duration annotation and the complicated phoneme-level mel-note alignment. Furthermore, we propose the first diffusion-based pitch modeling method, which ameliorates the naturalness of existing pitch-modeling methods. To achieve this, we collect a new dataset containing realistic music scores and singing voices according to these realistic music scores from professional singers. Extensive experiments on the dataset demonstrate the effectiveness of our methods. Audio samples are available at https://rmssinger.github.io/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.10686v1-abstract-full').style.display = 'none'; document.getElementById('2305.10686v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by Findings of ACL 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2305.04476">arXiv:2305.04476</a> <span> [<a href="https://arxiv.org/pdf/2305.04476">pdf</a>, <a href="https://arxiv.org/format/2305.04476">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> AlignSTS: Speech-to-Singing Conversion via Cross-Modal Alignment </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Li%2C+R">Ruiqi Li</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+R">Rongjie Huang</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+L">Lichao Zhang</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+J">Jinglin Liu</a>, <a href="/search/eess?searchtype=author&query=Zhao%2C+Z">Zhou Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2305.04476v4-abstract-short" style="display: inline;"> The speech-to-singing (STS) voice conversion task aims to generate singing samples corresponding to speech recordings while facing a major challenge: the alignment between the target (singing) pitch contour and the source (speech) content is difficult to learn in a text-free situation.
This paper proposes AlignSTS, an STS model based on explicit cross-modal alignment, which views speech variance s… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.04476v4-abstract-full').style.display = 'inline'; document.getElementById('2305.04476v4-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2305.04476v4-abstract-full" style="display: none;"> The speech-to-singing (STS) voice conversion task aims to generate singing samples corresponding to speech recordings while facing a major challenge: the alignment between the target (singing) pitch contour and the source (speech) content is difficult to learn in a text-free situation. This paper proposes AlignSTS, an STS model based on explicit cross-modal alignment, which views speech variance such as pitch and content as different modalities. Inspired by the mechanism of how humans will sing the lyrics to the melody, AlignSTS: 1) adopts a novel rhythm adaptor to predict the target rhythm representation to bridge the modality gap between content and pitch, where the rhythm representation is computed in a simple yet effective way and is quantized into a discrete space; and 2) uses the predicted rhythm representation to re-align the content based on cross-attention and conducts a cross-modal fusion for re-synthesize. Extensive experiments show that AlignSTS achieves superior performance in terms of both objective and subjective metrics. Audio samples are available at https://alignsts.github.io. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.04476v4-abstract-full').style.display = 'none'; document.getElementById('2305.04476v4-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 8 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2023. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Findings of ACL 2023</span> </p> </li> </ol>
<nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Huang%2C+R&start=50" class="pagination-next">Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Huang%2C+R&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Huang%2C+R&start=50" class="pagination-link" aria-label="Page 2">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Huang%2C+R&start=100" class="pagination-link" aria-label="Page 3">3 </a> </li> </ul> </nav>
</div> </main> </body> </html>