Search | arXiv e-print repository

Showing 1–50 of 94 results for author: Ye, Z

Searching in archive eess. Search in all archives.

Results sorted by announcement date (newest first), 50 per page.
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Ye%2C+Z&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Ye%2C+Z&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Ye%2C+Z&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.08638">arXiv:2503.08638</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.08638">pdf</a>, <a href="https://arxiv.org/format/2503.08638">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> YuE: Scaling Open Foundation Models for Long-Form Music Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Yuan%2C+R">Ruibin Yuan</a>, <a href="/search/eess?searchtype=author&amp;query=Lin%2C+H">Hanfeng Lin</a>, <a href="/search/eess?searchtype=author&amp;query=Guo%2C+S">Shuyue Guo</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+G">Ge Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Pan%2C+J">Jiahao Pan</a>, <a href="/search/eess?searchtype=author&amp;query=Zang%2C+Y">Yongyi Zang</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+H">Haohe Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Liang%2C+Y">Yiming Liang</a>, <a href="/search/eess?searchtype=author&amp;query=Ma%2C+W">Wenye Ma</a>, <a href="/search/eess?searchtype=author&amp;query=Du%2C+X">Xingjian Du</a>, <a href="/search/eess?searchtype=author&amp;query=Du%2C+X">Xinrun Du</a>, <a href="/search/eess?searchtype=author&amp;query=Ye%2C+Z">Zhen Ye</a>, <a href="/search/eess?searchtype=author&amp;query=Zheng%2C+T">Tianyu Zheng</a>, <a href="/search/eess?searchtype=author&amp;query=Ma%2C+Y">Yinghao Ma</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+M">Minghao Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Tian%2C+Z">Zeyue Tian</a>, <a href="/search/eess?searchtype=author&amp;query=Zhou%2C+Z">Ziya Zhou</a>, <a 
href="/search/eess?searchtype=author&amp;query=Xue%2C+L">Liumeng Xue</a>, <a href="/search/eess?searchtype=author&amp;query=Qu%2C+X">Xingwei Qu</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+Y">Yizhi Li</a>, <a href="/search/eess?searchtype=author&amp;query=Wu%2C+S">Shangda Wu</a>, <a href="/search/eess?searchtype=author&amp;query=Shen%2C+T">Tianhao Shen</a>, <a href="/search/eess?searchtype=author&amp;query=Ma%2C+Z">Ziyang Ma</a>, <a href="/search/eess?searchtype=author&amp;query=Zhan%2C+J">Jun Zhan</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+C">Chunhui Wang</a> , et al. (32 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.08638v1-abstract-short" style="display: inline;"> We tackle the task of long-form music generation--particularly the challenging \textbf{lyrics-to-song} problem--by introducing YuE, a family of open foundation models based on the LLaMA2 architecture. Specifically, YuE scales to trillions of tokens and generates up to five minutes of music while maintaining lyrical alignment, coherent musical structure, and engaging vocal melodies with appropriate&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.08638v1-abstract-full').style.display = 'inline'; document.getElementById('2503.08638v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.08638v1-abstract-full" style="display: none;"> We tackle the task of long-form music generation--particularly the challenging \textbf{lyrics-to-song} problem--by introducing YuE, a family of open foundation models based on the LLaMA2 architecture. Specifically, YuE scales to trillions of tokens and generates up to five minutes of music while maintaining lyrical alignment, coherent musical structure, and engaging vocal melodies with appropriate accompaniment. It achieves this through (1) track-decoupled next-token prediction to overcome dense mixture signals, (2) structural progressive conditioning for long-context lyrical alignment, and (3) a multitask, multiphase pre-training recipe to converge and generalize. In addition, we redesign the in-context learning technique for music generation, enabling versatile style transfer (e.g., converting Japanese city pop into an English rap while preserving the original accompaniment) and bidirectional generation. Through extensive evaluation, we demonstrate that YuE matches or even surpasses some of the proprietary systems in musicality and vocal agility. In addition, fine-tuning YuE enables additional controls and enhanced support for tail languages. Furthermore, beyond generation, we show that YuE&#39;s learned representations can perform well on music understanding tasks, where the results of YuE match or exceed state-of-the-art methods on the MARBLE benchmark. 
Keywords: lyrics2song, song generation, long-form, foundation model, music generation <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.08638v1-abstract-full').style.display = 'none'; document.getElementById('2503.08638v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">https://github.com/multimodal-art-projection/YuE</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.03786">arXiv:2503.03786</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.03786">pdf</a>, <a href="https://arxiv.org/format/2503.03786">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Tissues and Organs">q-bio.TO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Self is the Best Learner: CT-free Ultra-Low-Dose PET Organ Segmentation via Collaborating Denoising and Segmentation Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Ye%2C+Z">Zanting Ye</a>, <a href="/search/eess?searchtype=author&amp;query=Niu%2C+X">Xiaolong Niu</a>, <a href="/search/eess?searchtype=author&amp;query=Wu%2C+X">Xuanbin Wu</a>, <a href="/search/eess?searchtype=author&amp;query=Lu%2C+W">Wantong Lu</a>, <a href="/search/eess?searchtype=author&amp;query=Lu%2C+L">Lijun Lu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.03786v1-abstract-short" style="display: inline;"> Organ segmentation in Positron Emission Tomography (PET) plays a vital role in cancer quantification. Low-dose PET (LDPET) provides a safer alternative by reducing radiation exposure. However, the inherent noise and blurred boundaries make organ segmentation more challenging. Additionally, existing PET organ segmentation methods rely on co-registered Computed Tomography (CT) annotations, overlooki&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.03786v1-abstract-full').style.display = 'inline'; document.getElementById('2503.03786v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.03786v1-abstract-full" style="display: none;"> Organ segmentation in Positron Emission Tomography (PET) plays a vital role in cancer quantification. Low-dose PET (LDPET) provides a safer alternative by reducing radiation exposure. However, the inherent noise and blurred boundaries make organ segmentation more challenging. Additionally, existing PET organ segmentation methods rely on co-registered Computed Tomography (CT) annotations, overlooking the problem of modality mismatch. 
In this study, we propose LDOS, a novel CT-free ultra-LDPET organ segmentation pipeline. Inspired by Masked Autoencoders (MAE), we reinterpret LDPET as a naturally masked version of Full-Dose PET (FDPET). LDOS adopts a simple yet effective architecture: a shared encoder extracts generalized features, while task-specific decoders independently refine outputs for denoising and segmentation. By integrating CT-derived organ annotations into the denoising process, LDOS improves anatomical boundary recognition and alleviates the PET/CT misalignments. Experiments demonstrate that LDOS achieves state-of-the-art performance with mean Dice scores of 73.11% (18F-FDG) and 73.97% (68Ga-FAPI) across 18 organs in 5% dose PET. Our code is publicly available.
Submitted 4 March, 2025; originally announced March 2025.
Comments: 8 pages, 5 figures

3. arXiv:2503.01710 [pdf, other] cs.SD cs.AI eess.AS
Spark-TTS: An Efficient LLM-Based Text-to-Speech Model with Single-Stream Decoupled Speech Tokens
Authors: Xinsheng Wang, Mingqi Jiang, Ziyang Ma, Ziyu Zhang, Songxiang Liu, Linqin Li, Zheng Liang, Qixi Zheng, Rui Wang, Xiaoqin Feng, Weizhen Bian, Zhen Ye, Sitong Cheng, Ruibin Yuan, Zhixian Zhao, Xinfa Zhu, Jiahao Pan, Liumeng Xue, Pengcheng Zhu, Yunlin Chen, Zhifei Li, Xie Chen, Lei Xie, Yike Guo, Wei Xue
Abstract: Recent advancements in large language models (LLMs) have driven significant progress in zero-shot text-to-speech (TTS) synthesis. However, existing foundation models rely on multi-stage processing or complex architectures for predicting multiple codebooks, limiting efficiency and integration flexibility. To overcome these challenges, we introduce Spark-TTS, a novel system powered by BiCodec, a single-stream speech codec that decomposes speech into two complementary token types: low-bitrate semantic tokens for linguistic content and fixed-length global tokens for speaker attributes. This disentangled representation, combined with the Qwen2.5 LLM and a chain-of-thought (CoT) generation approach, enables both coarse-grained control (e.g., gender, speaking style) and fine-grained adjustments (e.g., precise pitch values, speaking rate). To facilitate research in controllable TTS, we introduce VoxBox, a meticulously curated 100,000-hour dataset with comprehensive attribute annotations. Extensive experiments demonstrate that Spark-TTS not only achieves state-of-the-art zero-shot voice cloning but also generates highly customizable voices that surpass the limitations of reference-based synthesis. Source code, pre-trained models, and audio samples are available at https://github.com/SparkAudio/Spark-TTS.
Submitted 3 March, 2025; originally announced March 2025.
Comments: Submitted to ACL 2025

4. arXiv:2503.00493 [pdf, other] eess.AS cs.AI cs.CL cs.SD
LLaSE-G1: Incentivizing Generalization Capability for LLaMA-based Speech Enhancement
Authors: Boyi Kang, Xinfa Zhu, Zihan Zhang, Zhen Ye, Mingshuai Liu, Ziqian Wang, Yike Zhu, Guobin Ma, Jun Chen, Longshuai Xiao, Chao Weng, Wei Xue, Lei Xie
Abstract: Recent advancements in language models (LMs) have demonstrated strong capabilities in semantic understanding and contextual modeling, which have flourished in generative speech enhancement (SE).
However, many LM-based SE approaches primarily focus on semantic information, often neglecting the critical role of acoustic information, which leads to acoustic inconsistency after enhancement and limited generalization across diverse SE tasks. In this paper, we introduce LLaSE-G1, a LLaMA-based language model that incentivizes generalization capabilities for speech enhancement. LLaSE-G1 offers the following key contributions: First, to mitigate acoustic inconsistency, LLaSE-G1 employs continuous representations from WavLM as input and predicts speech tokens from X-Codec2, maximizing acoustic preservation. Second, to promote generalization capability, LLaSE-G1 introduces dual-channel inputs and outputs, unifying multiple SE tasks without requiring task-specific IDs. Third, LLaSE-G1 outperforms prior task-specific discriminative and generative SE models, demonstrating scaling effects at test time and emerging capabilities for unseen SE tasks. Additionally, we release our code and models to support further research in this area.
Submitted 4 March, 2025; v1 submitted 1 March, 2025; originally announced March 2025.
Comments: 13 pages, 2 figures, 8 tables

5. arXiv:2502.18924 [pdf, other] eess.AS cs.LG cs.SD
Sparse Alignment Enhanced Latent Diffusion Transformer for Zero-Shot Speech Synthesis
Authors: Ziyue Jiang, Yi Ren, Ruiqi Li, Shengpeng Ji, Boyang Zhang, Zhenhui Ye, Chen Zhang, Bai Jionghao, Xiaoda Yang, Jialong Zuo, Yu Zhang, Rui Liu, Xiang Yin, Zhou Zhao
Abstract: While recent zero-shot text-to-speech (TTS) models have significantly improved speech quality and expressiveness, mainstream systems still suffer from issues related to speech-text alignment modeling: 1) models without explicit speech-text alignment modeling exhibit less robustness, especially for hard sentences in practical applications; 2) predefined alignment-based models suffer from naturalness constraints of forced alignments. This paper introduces S-DiT, a TTS system featuring an innovative sparse alignment algorithm that guides the latent diffusion transformer (DiT). Specifically, we provide sparse alignment boundaries to S-DiT to reduce the difficulty of alignment learning without limiting the search space, thereby achieving high naturalness. Moreover, we employ a multi-condition classifier-free guidance strategy for accent intensity adjustment and adopt the piecewise rectified flow technique to accelerate the generation process. Experiments demonstrate that S-DiT achieves state-of-the-art zero-shot TTS speech quality and supports highly flexible control over accent intensity. Notably, our system can generate high-quality one-minute speech with only 8 sampling steps. Audio samples are available at https://sditdemo.github.io/sditdemo/.
Submitted 24 March, 2025; v1 submitted 26 February, 2025; originally announced February 2025.
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.04128">arXiv:2502.04128</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.04128">pdf</a>, <a href="https://arxiv.org/format/2502.04128">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Llasa: Scaling Train-Time and Inference-Time Compute for Llama-based Speech Synthesis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Ye%2C+Z">Zhen Ye</a>, <a href="/search/eess?searchtype=author&amp;query=Zhu%2C+X">Xinfa Zhu</a>, <a href="/search/eess?searchtype=author&amp;query=Chan%2C+C">Chi-Min Chan</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+X">Xinsheng Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Tan%2C+X">Xu Tan</a>, <a href="/search/eess?searchtype=author&amp;query=Lei%2C+J">Jiahe Lei</a>, <a href="/search/eess?searchtype=author&amp;query=Peng%2C+Y">Yi Peng</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+H">Haohe Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Jin%2C+Y">Yizhu Jin</a>, <a href="/search/eess?searchtype=author&amp;query=Dai%2C+Z">Zheqi Dai</a>, <a href="/search/eess?searchtype=author&amp;query=Lin%2C+H">Hongzhan Lin</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+J">Jianyi Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Du%2C+X">Xingjian Du</a>, <a href="/search/eess?searchtype=author&amp;query=Xue%2C+L">Liumeng Xue</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+Y">Yunlin Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+Z">Zhifei Li</a>, <a href="/search/eess?searchtype=author&amp;query=Xie%2C+L">Lei Xie</a>, <a href="/search/eess?searchtype=author&amp;query=Kong%2C+Q">Qiuqiang Kong</a>, <a href="/search/eess?searchtype=author&amp;query=Guo%2C+Y">Yike Guo</a>, <a href="/search/eess?searchtype=author&amp;query=Xue%2C+W">Wei Xue</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.04128v2-abstract-short" style="display: inline;"> Recent advances in text-based large language models (LLMs), particularly in the GPT series and the o1 model, have demonstrated the effectiveness of scaling both training-time and inference-time compute. 
However, current state-of-the-art TTS systems leveraging LLMs are often multi-stage, requiring separate models (e.g., diffusion models after LLM), complicating the decision of whether to scale a pa&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.04128v2-abstract-full').style.display = 'inline'; document.getElementById('2502.04128v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.04128v2-abstract-full" style="display: none;"> Recent advances in text-based large language models (LLMs), particularly in the GPT series and the o1 model, have demonstrated the effectiveness of scaling both training-time and inference-time compute. However, current state-of-the-art TTS systems leveraging LLMs are often multi-stage, requiring separate models (e.g., diffusion models after LLM), complicating the decision of whether to scale a particular model during training or testing. This work makes the following contributions: First, we explore the scaling of train-time and inference-time compute for speech synthesis. Second, we propose a simple framework Llasa for speech synthesis that employs a single-layer vector quantizer (VQ) codec and a single Transformer architecture to fully align with standard LLMs such as Llama. Our experiments reveal that scaling train-time compute for Llasa consistently improves the naturalness of synthesized speech and enables the generation of more complex and accurate prosody patterns. Furthermore, from the perspective of scaling inference-time compute, we employ speech understanding models as verifiers during the search, finding that scaling inference-time compute shifts the sampling modes toward the preferences of specific verifiers, thereby improving emotional expressiveness, timbre consistency, and content accuracy. In addition, we released the checkpoint and training code for our TTS model (1B, 3B, 8B) and codec model publicly available. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.04128v2-abstract-full').style.display = 'none'; document.getElementById('2502.04128v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 6 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. 
7. arXiv:2501.18203 [pdf, other] math.OC eess.SY
Joint Design and Pricing of Extended Warranties for Multiple Automobiles with Different Price Bands
Authors: Yajing Chen, Yanrong Li, Xiao-Lin Wang, Zhi-Sheng Ye
Abstract: Extended warranties (EWs) are a significant source of revenue for capital-intensive products like automobiles. Such products consist of multiple subsystems, providing flexibility in EW customization, for example, bundling a tailored set of subsystems in an EW contract. This, in turn, enables the creation of a service menu with different EW contract options. From the perspective of a third-party EW provider servicing a fleet of automobile brands, we develop a novel model to jointly optimize the design and pricing of EWs in order to maximize the profit. Specifically, the problem is to determine which contracts should be included in the EW menu and identify the appropriate price for each contract. As the complexity of the joint optimization problem increases exponentially with the number of subsystems, two solution approaches are devised to solve the problem. The first approach is based on a mixed-integer second-order cone programming reformulation, which guarantees optimality but is applicable only for a small number of subsystems. The second approach utilizes a two-step iteration process, offering enhanced computational efficiency in scenarios with a large number of subsystems. Through numerical experiments, the effectiveness of our model is validated, particularly in scenarios characterized by high failure rates and a large number of subsystems.
Submitted 30 January, 2025; originally announced January 2025.

8. arXiv:2411.16729 [pdf, other] cs.SD cs.AI cs.GR cs.HC cs.MM eess.AS
DiM-Gestor: Co-Speech Gesture Generation with Adaptive Layer Normalization Mamba-2
Authors: Fan Zhang, Siyuan Zhao, Naye Ji, Zhaohan Wang, Jingmei Wu, Fuxing Gao, Zhenqing Ye, Leyao Yan, Lanxin Dai, Weidong Geng, Xin Lyu, Bozuo Zhao, Dingguo Yu, Hui Du, Bin Hu
Abstract: Speech-driven gesture generation using transformer-based generative models represents a rapidly advancing area within virtual human creation. However, existing models face significant challenges due to their quadratic time and space complexities, limiting scalability and efficiency.
To address these limitations, we introduce DiM-Gestor, an innovative end-to-end generative model leveraging the Mamba-2 architecture. DiM-Gestor features a dual-component framework: (1) a fuzzy feature extractor and (2) a speech-to-gesture mapping module, both built on the Mamba-2. The fuzzy feature extractor, integrated with a Chinese Pre-trained Model and Mamba-2, autonomously extracts implicit, continuous speech features. These features are synthesized into a unified latent representation and then processed by the speech-to-gesture mapping module. This module employs an Adaptive Layer Normalization (AdaLN)-enhanced Mamba-2 mechanism to uniformly apply transformations across all sequence tokens. This enables precise modeling of the nuanced interplay between speech features and gesture dynamics. We utilize a diffusion model to train and infer diverse gesture outputs. Extensive subjective and objective evaluations conducted on the newly released Chinese Co-Speech Gestures dataset corroborate the efficacy of our proposed model. Compared with Transformer-based architecture, the assessments reveal that our approach delivers competitive results and significantly reduces memory usage, approximately 2.4 times, and enhances inference speeds by 2 to 4 times. Additionally, we released the CCG dataset, a Chinese Co-Speech Gestures dataset, comprising 15.97 hours (six styles across five scenarios) of 3D full-body skeleton gesture motion performed by professional Chinese TV broadcasters.
Submitted 23 November, 2024; originally announced November 2024.
Comments: 13 pages, 11 figures

9. arXiv:2410.22646 [pdf, other] eess.SP cs.LG
SleepNetZero: Zero-Burden Zero-Shot Reliable Sleep Staging With Neural Networks Based on Ballistocardiograms
Authors: Shuzhen Li, Yuxin Chen, Xuesong Chen, Ruiyang Gao, Yupeng Zhang, Chao Yu, Yunfei Li, Ziyi Ye, Weijun Huang, Hongliang Yi, Yue Leng, Yi Wu
Abstract: Sleep monitoring plays a crucial role in maintaining good health, with sleep staging serving as an essential metric in the monitoring process. Traditional methods, utilizing medical sensors like EEG and ECG, can be effective but often present challenges such as unnatural user experience, complex deployment, and high costs. Ballistocardiography (BCG), a type of piezoelectric sensor signal, offers a non-invasive, user-friendly, and easily deployable alternative for long-term home monitoring. However, reliable BCG-based sleep staging is challenging due to the limited sleep monitoring data available for BCG. A restricted training dataset prevents the model from generalization across populations.
Additionally, transferring to BCG faces difficulty ensuring model robustness when migrating from other data sources. To address these issues, we introduce SleepNetZero, a zero-shot learning based approach for sleep staging. To tackle the generalization challenge, we propose a series of BCG feature extraction methods that align BCG components with corresponding respiratory, cardiac, and movement channels in PSG. This allows models to be trained on large-scale PSG datasets that are diverse in population. For the migration challenge, we employ data augmentation techniques, significantly enhancing generalizability. We conducted extensive training and testing on large datasets~(12393 records from 9637 different subjects), achieving an accuracy of 0.803 and a Cohen&#39;s Kappa of 0.718. ZeroSleepNet was also deployed in real prototype~(monitoring pads) and tested in actual hospital settings~(265 users), demonstrating an accuracy of 0.697 and a Cohen&#39;s Kappa of 0.589. To the best of our knowledge, this work represents the first known reliable BCG-based sleep staging effort and marks a significant step towards in-home health monitoring. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22646v1-abstract-full').style.display = 'none'; document.getElementById('2410.22646v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">25 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.22066">arXiv:2410.22066</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.22066">pdf</a>, <a href="https://arxiv.org/format/2410.22066">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Sing it, Narrate it: Quality Musical Lyrics Translation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Ye%2C+Z">Zhuorui Ye</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+J">Jinhan Li</a>, <a href="/search/eess?searchtype=author&amp;query=Xu%2C+R">Rongwu Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.22066v1-abstract-short" style="display: inline;"> Translating lyrics for musicals presents unique challenges due to the need to ensure high translation quality while adhering to singability requirements such as length and rhyme. Existing song translation approaches often prioritize these singability constraints at the expense of translation quality, which is crucial for musicals. 
This paper aims to enhance translation quality while maintaining ke&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22066v1-abstract-full').style.display = 'inline'; document.getElementById('2410.22066v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.22066v1-abstract-full" style="display: none;"> Translating lyrics for musicals presents unique challenges due to the need to ensure high translation quality while adhering to singability requirements such as length and rhyme. Existing song translation approaches often prioritize these singability constraints at the expense of translation quality, which is crucial for musicals. This paper aims to enhance translation quality while maintaining key singability features. Our method consists of three main components. First, we create a dataset to train reward models for the automatic evaluation of translation quality. Second, to enhance both singability and translation quality, we implement a two-stage training process with filtering techniques. Finally, we introduce an inference-time optimization framework for translating entire songs. Extensive experiments, including both automatic and human evaluations, demonstrate significant improvements over baseline methods and validate the effectiveness of each component in our approach. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22066v1-abstract-full').style.display = 'none'; document.getElementById('2410.22066v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.11002">arXiv:2410.11002</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.11002">pdf</a>, <a href="https://arxiv.org/format/2410.11002">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Optimizing Radio Access Technology Selection and Precoding in CV-Aided ISAC Systems </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Gao%2C+Y">Yulan Gao</a>, <a href="/search/eess?searchtype=author&amp;query=Ye%2C+Z">Ziqiang Ye</a>, <a href="/search/eess?searchtype=author&amp;query=Xiao%2C+M">Ming Xiao</a>, <a href="/search/eess?searchtype=author&amp;query=Xiao%2C+Y">Yue Xiao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.11002v1-abstract-short" style="display: inline;"> Integrated Sensing and Communication (ISAC) systems promise to revolutionize wireless networks by concurrently supporting high-resolution sensing and high-performance communication. 
This paper presents a novel radio access technology (RAT) selection framework that capitalizes on vision sensing from base station (BS) cameras to optimize both communication and perception capabilities within the ISAC&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.11002v1-abstract-full').style.display = 'inline'; document.getElementById('2410.11002v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.11002v1-abstract-full" style="display: none;"> Integrated Sensing and Communication (ISAC) systems promise to revolutionize wireless networks by concurrently supporting high-resolution sensing and high-performance communication. This paper presents a novel radio access technology (RAT) selection framework that capitalizes on vision sensing from base station (BS) cameras to optimize both communication and perception capabilities within the ISAC system. Our framework strategically employs two distinct RATs, LTE and millimeter wave (mmWave), to enhance system performance. We propose a vision-based user localization method that employs a 3D detection technique to capture the spatial distribution of users within the surrounding environment. This is followed by geometric calculations to accurately determine the state of mmWave communication links between the BS and individual users. Additionally, we integrate the SlowFast model to recognize user activities, facilitating adaptive transmission rate allocation based on observed behaviors. We develop a Deep Deterministic Policy Gradient (DDPG)-based algorithm, utilizing the joint distribution of users and their activities, designed to maximize the total transmission rate for all users through joint RAT selection and precoding optimization, while adhering to constraints on sensing mutual information and minimum transmission rates. Numerical simulation results demonstrate the effectiveness of the proposed framework in dynamically adjusting resource allocation, ensuring high-quality communication under challenging conditions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.11002v1-abstract-full').style.display = 'none'; document.getElementById('2410.11002v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
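<p> To make the geometry step in the abstract above concrete, the sketch below shows one plausible way a vision-derived obstacle list could be turned into an mmWave link-state check and a per-user LTE/mmWave choice. The function names, clearance threshold, and rate constants are illustrative assumptions, and the greedy selection merely stands in for the DDPG-based joint RAT selection and precoding optimization the paper actually proposes. </p>
<pre><code>
import numpy as np

# Illustrative constants (assumptions, not values from the paper).
LTE_RATE_BPS = 75e6        # nominal LTE throughput
MMWAVE_RATE_BPS = 1.2e9    # nominal mmWave throughput when the link is line-of-sight

def mmwave_link_is_los(bs_pos, user_pos, obstacle_centres, clearance=0.5):
    """Rough link-state test: the BS-to-user segment must not pass within
    `clearance` metres of any obstacle centre returned by the 3D detector."""
    bs = np.asarray(bs_pos, dtype=float)
    user = np.asarray(user_pos, dtype=float)
    seg = user - bs
    length = np.linalg.norm(seg)
    direction = seg / length
    for centre in obstacle_centres:
        c = np.asarray(centre, dtype=float)
        t = np.clip(np.dot(c - bs, direction), 0.0, length)
        if np.linalg.norm(c - (bs + t * direction)) < clearance:
            return False
    return True

def select_rat(bs_pos, user_positions, obstacle_centres, activity_demand_bps):
    """Greedy per-user RAT choice: use mmWave only when the vision-based
    geometry says the link is unobstructed and the recognised activity
    needs more rate than LTE can offer; otherwise fall back to LTE."""
    decisions = {}
    for uid, pos in user_positions.items():
        los = mmwave_link_is_los(bs_pos, pos, obstacle_centres)
        demand = activity_demand_bps[uid]
        if los and demand > LTE_RATE_BPS:
            decisions[uid] = ("mmWave", min(MMWAVE_RATE_BPS, demand))
        else:
            decisions[uid] = ("LTE", min(LTE_RATE_BPS, demand))
    return decisions
</code></pre>
<p> For example, select_rat([0, 0, 10], {"u1": [30.0, 5.0, 1.5]}, obstacles, {"u1": 4e8}) would assign the hypothetical user u1 to mmWave only if the detected obstacles leave the link clear. </p>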
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.10676">arXiv:2410.10676</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.10676">pdf</a>, <a href="https://arxiv.org/format/2410.10676">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Both Ears Wide Open: Towards Language-Driven Spatial Audio Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Sun%2C+P">Peiwen Sun</a>, <a href="/search/eess?searchtype=author&amp;query=Cheng%2C+S">Sitong Cheng</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+X">Xiangtai Li</a>, <a href="/search/eess?searchtype=author&amp;query=Ye%2C+Z">Zhen Ye</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+H">Huadai Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+H">Honggang Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Xue%2C+W">Wei Xue</a>, <a href="/search/eess?searchtype=author&amp;query=Guo%2C+Y">Yike Guo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.10676v2-abstract-short" style="display: inline;"> Recently, diffusion models have achieved great success in mono-channel audio generation. However, when it comes to stereo audio generation, the soundscapes often have a complex scene of multiple objects and directions. Controlling stereo audio with spatial contexts remains challenging due to high data costs and unstable generative models. To the best of our knowledge, this work represents the firs&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.10676v2-abstract-full').style.display = 'inline'; document.getElementById('2410.10676v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.10676v2-abstract-full" style="display: none;"> Recently, diffusion models have achieved great success in mono-channel audio generation. However, when it comes to stereo audio generation, the soundscapes often have a complex scene of multiple objects and directions. Controlling stereo audio with spatial contexts remains challenging due to high data costs and unstable generative models. To the best of our knowledge, this work represents the first attempt to address these issues. We first construct a large-scale, simulation-based, and GPT-assisted dataset, BEWO-1M, with abundant soundscapes and descriptions even including moving and multiple sources. Beyond text modality, we have also acquired a set of images and rationally paired stereo audios through retrieval to advance multimodal generation. Existing audio generation models tend to generate rather random and indistinct spatial audio. To provide accurate guidance for Latent Diffusion Models, we introduce the SpatialSonic model utilizing spatial-aware encoders and azimuth state matrices to reveal reasonable spatial guidance. 
By leveraging spatial guidance, our model not only achieves the objective of generating immersive and controllable spatial audio from text but also extends to other modalities as the pioneer attempt. Finally, under fair settings, we conduct subjective and objective evaluations on simulated and real-world data to compare our approach with prevailing methods. The results demonstrate the effectiveness of our method, highlighting its capability to generate spatial audio that adheres to physical rules. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.10676v2-abstract-full').style.display = 'none'; document.getElementById('2410.10676v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 14 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ICLR 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.13698">arXiv:2409.13698</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.13698">pdf</a>, <a href="https://arxiv.org/format/2409.13698">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.21437/Interspeech.2024-768">10.21437/Interspeech.2024-768 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Lightweight Transducer Based on Frame-Level Criterion </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Wan%2C+G">Genshun Wan</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+M">Mengzhi Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Mao%2C+T">Tingzhi Mao</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+H">Hang Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Ye%2C+Z">Zhongfu Ye</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.13698v2-abstract-short" style="display: inline;"> The transducer model trained based on sequence-level criterion requires a lot of memory due to the generation of the large probability matrix. We proposed a lightweight transducer model based on frame-level criterion, which uses the results of the CTC forced alignment algorithm to determine the label for each frame. 
Then the encoder output can be combined with the decoder output at the correspondi&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.13698v2-abstract-full').style.display = 'inline'; document.getElementById('2409.13698v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.13698v2-abstract-full" style="display: none;"> The transducer model trained based on sequence-level criterion requires a lot of memory due to the generation of the large probability matrix. We proposed a lightweight transducer model based on frame-level criterion, which uses the results of the CTC forced alignment algorithm to determine the label for each frame. Then the encoder output can be combined with the decoder output at the corresponding time, rather than adding each element output by the encoder to each element output by the decoder as in the transducer. This significantly reduces memory and computation requirements. To address the problem of imbalanced classification caused by excessive blanks in the label, we decouple the blank and non-blank probabilities and truncate the gradient of the blank classifier to the main network. Experiments on AISHELL-1 demonstrate that this enables the lightweight transducer to achieve results similar to the standard transducer. Additionally, we use richer information to predict the probability of blank, achieving results superior to the transducer. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.13698v2-abstract-full').style.display = 'none'; document.getElementById('2409.13698v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by Interspeech 2024, code repository: https://github.com/wangmengzhi/Lightweight-Transducer</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Proc.
Interspeech 2024, 247-251 (2024) </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.13582">arXiv:2409.13582</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.13582">pdf</a>, <a href="https://arxiv.org/format/2409.13582">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Time and Tokens: Benchmarking End-to-End Speech Dysfluency Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Zhou%2C+X">Xuanru Zhou</a>, <a href="/search/eess?searchtype=author&amp;query=Lian%2C+J">Jiachen Lian</a>, <a href="/search/eess?searchtype=author&amp;query=Cho%2C+C+J">Cheol Jun Cho</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+J">Jingwen Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Ye%2C+Z">Zongli Ye</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+J">Jinming Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Morin%2C+B">Brittany Morin</a>, <a href="/search/eess?searchtype=author&amp;query=Baquirin%2C+D">David Baquirin</a>, <a href="/search/eess?searchtype=author&amp;query=Vonk%2C+J">Jet Vonk</a>, <a href="/search/eess?searchtype=author&amp;query=Ezzes%2C+Z">Zoe Ezzes</a>, <a href="/search/eess?searchtype=author&amp;query=Miller%2C+Z">Zachary Miller</a>, <a href="/search/eess?searchtype=author&amp;query=Tempini%2C+M+L+G">Maria Luisa Gorno Tempini</a>, <a href="/search/eess?searchtype=author&amp;query=Anumanchipalli%2C+G">Gopala Anumanchipalli</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.13582v1-abstract-short" style="display: inline;"> Speech dysfluency modeling is a task to detect dysfluencies in speech, such as repetition, block, insertion, replacement, and deletion. Most recent advancements treat this problem as a time-based object detection problem. In this work, we revisit this problem from a new perspective: tokenizing dysfluencies and modeling the detection problem as a token-based automatic speech recognition (ASR) probl&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.13582v1-abstract-full').style.display = 'inline'; document.getElementById('2409.13582v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.13582v1-abstract-full" style="display: none;"> Speech dysfluency modeling is a task to detect dysfluencies in speech, such as repetition, block, insertion, replacement, and deletion. Most recent advancements treat this problem as a time-based object detection problem. In this work, we revisit this problem from a new perspective: tokenizing dysfluencies and modeling the detection problem as a token-based automatic speech recognition (ASR) problem. We propose rule-based speech and text dysfluency simulators and develop VCTK-token, and then develop a Whisper-like seq2seq architecture to build a new benchmark with decent performance. 
We also systematically compare our proposed token-based methods with time-based methods, and propose a unified benchmark to facilitate future research endeavors. We open-source these resources for the broader scientific community. The project page is available at https://rorizzz.github.io/ <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.13582v1-abstract-full').style.display = 'none'; document.getElementById('2409.13582v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.17175">arXiv:2408.17175</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.17175">pdf</a>, <a href="https://arxiv.org/format/2408.17175">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Codec Does Matter: Exploring the Semantic Shortcoming of Codec for Audio Language Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Ye%2C+Z">Zhen Ye</a>, <a href="/search/eess?searchtype=author&amp;query=Sun%2C+P">Peiwen Sun</a>, <a href="/search/eess?searchtype=author&amp;query=Lei%2C+J">Jiahe Lei</a>, <a href="/search/eess?searchtype=author&amp;query=Lin%2C+H">Hongzhan Lin</a>, <a href="/search/eess?searchtype=author&amp;query=Tan%2C+X">Xu Tan</a>, <a href="/search/eess?searchtype=author&amp;query=Dai%2C+Z">Zheqi Dai</a>, <a href="/search/eess?searchtype=author&amp;query=Kong%2C+Q">Qiuqiang Kong</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+J">Jianyi Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Pan%2C+J">Jiahao Pan</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+Q">Qifeng Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Guo%2C+Y">Yike Guo</a>, <a href="/search/eess?searchtype=author&amp;query=Xue%2C+W">Wei Xue</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.17175v3-abstract-short" style="display: inline;"> Recent advancements in audio generation have been significantly propelled by the capabilities of Large Language Models (LLMs). The existing research on audio LLM has primarily focused on enhancing the architecture and scale of audio language models, as well as leveraging larger datasets, and generally, acoustic codecs, such as EnCodec, are used for audio tokenization. 
However, these codecs were or&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.17175v3-abstract-full').style.display = 'inline'; document.getElementById('2408.17175v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.17175v3-abstract-full" style="display: none;"> Recent advancements in audio generation have been significantly propelled by the capabilities of Large Language Models (LLMs). The existing research on audio LLM has primarily focused on enhancing the architecture and scale of audio language models, as well as leveraging larger datasets, and generally, acoustic codecs, such as EnCodec, are used for audio tokenization. However, these codecs were originally designed for audio compression, which may lead to suboptimal performance in the context of audio LLM. Our research aims to address the shortcomings of current audio LLM codecs, particularly their challenges in maintaining semantic integrity in generated audio. For instance, existing methods like VALL-E, which condition acoustic token generation on text transcriptions, often suffer from content inaccuracies and elevated word error rates (WER) due to semantic misinterpretations of acoustic tokens, resulting in word skipping and errors. To overcome these issues, we propose a straightforward yet effective approach called X-Codec. X-Codec incorporates semantic features from a pre-trained semantic encoder before the Residual Vector Quantization (RVQ) stage and introduces a semantic reconstruction loss after RVQ. By enhancing the semantic ability of the codec, X-Codec significantly reduces WER in speech synthesis tasks and extends these benefits to non-speech applications, including music and sound generation. Our experiments in text-to-speech, music continuation, and text-to-sound tasks demonstrate that integrating semantic information substantially improves the overall performance of language models in audio generation. Our code and demo are available (Demo: https://x-codec-audio.github.io Code: https://github.com/zhenye234/xcodec) <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.17175v3-abstract-full').style.display = 'none'; document.getElementById('2408.17175v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 30 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. 
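<p> The core mechanism this abstract describes, fusing features from a frozen semantic encoder with the acoustic features before residual vector quantization (RVQ) and adding a semantic reconstruction loss after it, can be sketched in a few lines of PyTorch. The module sizes, the toy quantizer, and the unweighted loss sum below are assumptions made for illustration; this is not the released X-Codec implementation. </p>
<pre><code>
import torch
import torch.nn as nn
import torch.nn.functional as F

class ToyRVQ(nn.Module):
    """Minimal residual vector quantizer with a straight-through estimator
    (commitment/codebook losses are omitted to keep the sketch short)."""
    def __init__(self, dim=256, codebook_size=1024, n_quantizers=4):
        super().__init__()
        self.codebooks = nn.ModuleList(
            [nn.Embedding(codebook_size, dim) for _ in range(n_quantizers)])

    def forward(self, x):                              # x: (batch, time, dim)
        residual, quantized = x, torch.zeros_like(x)
        for cb in self.codebooks:
            dist = (residual.unsqueeze(-2) - cb.weight).pow(2).sum(-1)
            code = cb(dist.argmin(dim=-1))
            quantized = quantized + code
            residual = residual - code.detach()
        return x + (quantized - x).detach()            # straight-through

class SemanticAwareCodec(nn.Module):
    """Semantic features from a frozen pre-trained encoder are fused with the
    acoustic features before RVQ; after RVQ the latent must reconstruct both,
    which is the extra semantic reconstruction loss the abstract refers to."""
    def __init__(self, acoustic_dim=128, semantic_dim=768, latent_dim=256):
        super().__init__()
        self.fuse = nn.Linear(acoustic_dim + semantic_dim, latent_dim)
        self.rvq = ToyRVQ(latent_dim)
        self.acoustic_head = nn.Linear(latent_dim, acoustic_dim)
        self.semantic_head = nn.Linear(latent_dim, semantic_dim)

    def forward(self, acoustic_feat, semantic_feat):
        z = self.fuse(torch.cat([acoustic_feat, semantic_feat], dim=-1))
        zq = self.rvq(z)
        recon_loss = F.l1_loss(self.acoustic_head(zq), acoustic_feat)
        semantic_loss = F.mse_loss(self.semantic_head(zq), semantic_feat.detach())
        return zq, recon_loss + semantic_loss
</code></pre>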
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.13056">arXiv:2408.13056</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.13056">pdf</a>, <a href="https://arxiv.org/format/2408.13056">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> GNSS Interference Classification Using Federated Reservoir Computing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Ye%2C+Z">Ziqiang Ye</a>, <a href="/search/eess?searchtype=author&amp;query=Gao%2C+Y">Yulan Gao</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+X">Xinyue Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Xiao%2C+Y">Yue Xiao</a>, <a href="/search/eess?searchtype=author&amp;query=Xiao%2C+M">Ming Xiao</a>, <a href="/search/eess?searchtype=author&amp;query=Zammit%2C+S">Saviour Zammit</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.13056v1-abstract-short" style="display: inline;"> The expanding use of Unmanned Aerial Vehicles (UAVs) in vital areas like traffic management, surveillance, and environmental monitoring highlights the need for robust communication and navigation systems. Particularly vulnerable are Global Navigation Satellite Systems (GNSS), which face a spectrum of interference and jamming threats that can significantly undermine their performance. While traditi&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.13056v1-abstract-full').style.display = 'inline'; document.getElementById('2408.13056v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.13056v1-abstract-full" style="display: none;"> The expanding use of Unmanned Aerial Vehicles (UAVs) in vital areas like traffic management, surveillance, and environmental monitoring highlights the need for robust communication and navigation systems. Particularly vulnerable are Global Navigation Satellite Systems (GNSS), which face a spectrum of interference and jamming threats that can significantly undermine their performance. While traditional deep learning approaches are adept at mitigating these issues, they often fall short for UAV applications due to significant computational demands and the complexities of managing large, centralized datasets. In response, this paper introduces Federated Reservoir Computing (FedRC) as a potent and efficient solution tailored to enhance interference classification in GNSS systems used by UAVs. Our experimental results demonstrate that FedRC not only achieves faster convergence but also sustains lower loss levels than traditional models, highlighting its exceptional adaptability and operational efficiency. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.13056v1-abstract-full').style.display = 'none'; document.getElementById('2408.13056v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. 
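<p> As a rough illustration of what federated reservoir computing can look like, the sketch below trains a closed-form ridge-regression readout on top of a fixed random echo state network at each client and then averages the readout weights in a FedAvg style. The reservoir size, leak rate, time-averaged state feature, and aggregation rule are assumptions; the abstract does not spell out the authors' exact design. </p>
<pre><code>
import numpy as np

rng = np.random.default_rng(0)

class Reservoir:
    """Fixed random echo state network; only a linear readout is ever trained."""
    def __init__(self, n_in, n_res=200, spectral_radius=0.9, leak=0.3):
        self.w_in = rng.uniform(-0.5, 0.5, (n_res, n_in))
        w = rng.uniform(-0.5, 0.5, (n_res, n_res))
        self.w = w * spectral_radius / max(abs(np.linalg.eigvals(w)))
        self.leak = leak

    def feature(self, signal):                    # signal: (time, n_in) samples
        x = np.zeros(self.w.shape[0])
        states = []
        for u in signal:
            x = (1 - self.leak) * x + self.leak * np.tanh(self.w_in @ u + self.w @ x)
            states.append(x.copy())
        return np.mean(states, axis=0)            # time-averaged state as the feature

def local_readout(reservoir, signals, labels, n_classes, ridge=1e-2):
    """Closed-form ridge-regression readout on one client's labelled snapshots."""
    feats = np.stack([reservoir.feature(s) for s in signals])
    targets = np.eye(n_classes)[np.asarray(labels)]
    gram = feats.T @ feats + ridge * np.eye(feats.shape[1])
    return np.linalg.solve(gram, feats.T @ targets), len(labels)

def federated_round(reservoir, clients, n_classes):
    """FedAvg-style aggregation: average the client readouts weighted by sample
    count; the shared random reservoir itself never leaves the devices."""
    readouts, counts = zip(*(local_readout(reservoir, s, y, n_classes) for s, y in clients))
    weights = np.asarray(counts, dtype=float)
    weights = weights / weights.sum()
    return np.tensordot(weights, np.stack(readouts), axes=1)
</code></pre>
<p> At inference, a client would score a new IQ snapshot with reservoir.feature(snapshot) @ readout and take the argmax over interference classes. </p>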
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.08242">arXiv:2408.08242</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.08242">pdf</a>, <a href="https://arxiv.org/ps/2408.08242">ps</a>, <a href="https://arxiv.org/format/2408.08242">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> A Conflicts-free, Speed-lossless KAN-based Reinforcement Learning Decision System for Interactive Driving in Roundabouts </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Lin%2C+Z">Zhihao Lin</a>, <a href="/search/eess?searchtype=author&amp;query=Tian%2C+Z">Zhen Tian</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+Q">Qi Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Ye%2C+Z">Ziyang Ye</a>, <a href="/search/eess?searchtype=author&amp;query=Zhuang%2C+H">Hanyang Zhuang</a>, <a href="/search/eess?searchtype=author&amp;query=Lan%2C+J">Jianglin Lan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.08242v1-abstract-short" style="display: inline;"> Safety and efficiency are crucial for autonomous driving in roundabouts, especially in the context of mixed traffic where autonomous vehicles (AVs) and human-driven vehicles coexist. This paper introduces a learning-based algorithm tailored to foster safe and efficient driving behaviors across varying levels of traffic flows in roundabouts. The proposed algorithm employs a deep Q-learning network&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.08242v1-abstract-full').style.display = 'inline'; document.getElementById('2408.08242v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.08242v1-abstract-full" style="display: none;"> Safety and efficiency are crucial for autonomous driving in roundabouts, especially in the context of mixed traffic where autonomous vehicles (AVs) and human-driven vehicles coexist. This paper introduces a learning-based algorithm tailored to foster safe and efficient driving behaviors across varying levels of traffic flows in roundabouts. The proposed algorithm employs a deep Q-learning network to effectively learn safe and efficient driving strategies in complex multi-vehicle roundabouts. Additionally, a KAN (Kolmogorov-Arnold network) enhances the AVs&#39; ability to learn their surroundings robustly and precisely. An action inspector is integrated to replace dangerous actions to avoid collisions when the AV interacts with the environment, and a route planner is proposed to enhance the driving efficiency and safety of the AVs. Moreover, a model predictive control is adopted to ensure stability and precision of the driving actions. 
The results show that our proposed system consistently achieves safe and efficient driving whilst maintaining a stable training process, as evidenced by the smooth convergence of the reward function and the low variance in the training curves across various traffic flows. Compared to state-of-the-art benchmarks, the proposed algorithm achieves a lower number of collisions and reduced travel time to destination. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.08242v1-abstract-full').style.display = 'none'; document.getElementById('2408.08242v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">15 pages, 12 figures, submitted to an IEEE journal</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.04708">arXiv:2408.04708</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.04708">pdf</a>, <a href="https://arxiv.org/format/2408.04708">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> MulliVC: Multi-lingual Voice Conversion With Cycle Consistency </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Huang%2C+J">Jiawei Huang</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+C">Chen Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Ren%2C+Y">Yi Ren</a>, <a href="/search/eess?searchtype=author&amp;query=Jiang%2C+Z">Ziyue Jiang</a>, <a href="/search/eess?searchtype=author&amp;query=Ye%2C+Z">Zhenhui Ye</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+J">Jinglin Liu</a>, <a href="/search/eess?searchtype=author&amp;query=He%2C+J">Jinzheng He</a>, <a href="/search/eess?searchtype=author&amp;query=Yin%2C+X">Xiang Yin</a>, <a href="/search/eess?searchtype=author&amp;query=Zhao%2C+Z">Zhou Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.04708v1-abstract-short" style="display: inline;"> Voice conversion aims to modify the source speaker&#39;s voice to resemble the target speaker while preserving the original speech content. Despite notable advancements in voice conversion these days, multi-lingual voice conversion (including both monolingual and cross-lingual scenarios) has yet to be extensively studied. 
It faces two main challenges: 1) the considerable variability in prosody and art&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.04708v1-abstract-full').style.display = 'inline'; document.getElementById('2408.04708v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.04708v1-abstract-full" style="display: none;"> Voice conversion aims to modify the source speaker&#39;s voice to resemble the target speaker while preserving the original speech content. Despite notable advancements in voice conversion these days, multi-lingual voice conversion (including both monolingual and cross-lingual scenarios) has yet to be extensively studied. It faces two main challenges: 1) the considerable variability in prosody and articulation habits across languages; and 2) the rarity of paired multi-lingual datasets from the same speaker. In this paper, we propose MulliVC, a novel voice conversion system that only converts timbre and keeps original content and source language prosody without multi-lingual paired data. Specifically, each training step of MulliVC contains three substeps: In step one the model is trained with monolingual speech data; then, steps two and three take inspiration from back translation, construct a cyclical process to disentangle the timbre and other information (content, prosody, and other language-related information) in the absence of multi-lingual data from the same speaker. Both objective and subjective results indicate that MulliVC significantly surpasses other methods in both monolingual and cross-lingual contexts, demonstrating the system&#39;s efficacy and the viability of the three-step approach with cycle consistency. Audio samples can be found on our demo page (mullivc.github.io). <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.04708v1-abstract-full').style.display = 'none'; document.getElementById('2408.04708v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.05928">arXiv:2407.05928</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.05928">pdf</a>, <a href="https://arxiv.org/format/2407.05928">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> CA-FedRC: Codebook Adaptation via Federated Reservoir Computing in 5G NR </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Ye%2C+Z">Ziqiang Ye</a>, <a href="/search/eess?searchtype=author&amp;query=Liao%2C+S">Sikai Liao</a>, <a href="/search/eess?searchtype=author&amp;query=Gao%2C+Y">Yulan Gao</a>, <a href="/search/eess?searchtype=author&amp;query=Fang%2C+S">Shu Fang</a>, <a href="/search/eess?searchtype=author&amp;query=Xiao%2C+Y">Yue Xiao</a>, <a href="/search/eess?searchtype=author&amp;query=Xiao%2C+M">Ming Xiao</a>, <a href="/search/eess?searchtype=author&amp;query=Zammit%2C+S">Saviour Zammit</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.05928v1-abstract-short" style="display: inline;"> With the burgeoning deployment of the fifth-generation new radio (5G NR) networks, the codebook plays a crucial role in enabling the base station (BS) to acquire the channel state information (CSI). Different 5G NR codebooks incur varying overheads and exhibit performance disparities under diverse channel conditions, necessitating codebook adaptation based on channel conditions to reduce feedback ove&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.05928v1-abstract-full').style.display = 'inline'; document.getElementById('2407.05928v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.05928v1-abstract-full" style="display: none;"> With the burgeoning deployment of the fifth-generation new radio (5G NR) networks, the codebook plays a crucial role in enabling the base station (BS) to acquire the channel state information (CSI). Different 5G NR codebooks incur varying overheads and exhibit performance disparities under diverse channel conditions, necessitating codebook adaptation based on channel conditions to reduce feedback overhead while enhancing performance. However, existing methods of 5G NR codebook adaptation require significant overhead for model training and feedback or fall short in performance. To address these limitations, this letter introduces a federated reservoir computing framework designed for efficient codebook adaptation in computationally and feedback resource-constrained mobile devices. This framework utilizes a novel series of indicators as input training data, striking an effective balance between performance and feedback overhead. Compared to conventional models, the proposed codebook adaptation via federated reservoir computing (CA-FedRC) achieves rapid convergence and significant loss reduction in both speed and accuracy.
Extensive simulations under various channel conditions demonstrate that our algorithm not only reduces resource consumption of users but also accurately identifies channel types, thereby optimizing the trade-off between spectrum efficiency, computational complexity, and feedback overhead. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.05928v1-abstract-full').style.display = 'none'; document.getElementById('2407.05928v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.05571">arXiv:2407.05571</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.05571">pdf</a>, <a href="https://arxiv.org/format/2407.05571">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Cost-Efficient Computation Offloading in SAGIN: A Deep Reinforcement Learning and Perception-Aided Approach </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Gao%2C+Y">Yulan Gao</a>, <a href="/search/eess?searchtype=author&amp;query=Ye%2C+Z">Ziqiang Ye</a>, <a href="/search/eess?searchtype=author&amp;query=Yu%2C+H">Han Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.05571v1-abstract-short" style="display: inline;"> The Space-Air-Ground Integrated Network (SAGIN), crucial to the advancement of sixth-generation (6G) technology, plays a key role in ensuring universal connectivity, particularly by addressing the communication needs of remote areas lacking cellular network infrastructure. This paper delves into the role of unmanned aerial vehicles (UAVs) within SAGIN, where they act as a control layer owing to th&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.05571v1-abstract-full').style.display = 'inline'; document.getElementById('2407.05571v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.05571v1-abstract-full" style="display: none;"> The Space-Air-Ground Integrated Network (SAGIN), crucial to the advancement of sixth-generation (6G) technology, plays a key role in ensuring universal connectivity, particularly by addressing the communication needs of remote areas lacking cellular network infrastructure. This paper delves into the role of unmanned aerial vehicles (UAVs) within SAGIN, where they act as a control layer owing to their adaptable deployment capabilities and their intermediary role. Equipped with millimeter-wave (mmWave) radar and vision sensors, these UAVs are capable of acquiring multi-source data, which helps to diminish uncertainty and enhance the accuracy of decision-making. 
Concurrently, UAVs collect tasks requiring computing resources from their coverage areas, originating from a variety of mobile devices moving at different speeds. These tasks are then allocated to ground base stations (BSs), low-earth-orbit (LEO) satellite, and local processing units to improve processing efficiency. Amidst this framework, our study concentrates on devising dynamic strategies for facilitating task hosting between mobile devices and UAVs, offloading computations, managing associations between UAVs and BSs, and allocating computing resources. The objective is to minimize the time-averaged network cost, considering the uncertainty of device locations, speeds, and even types. To tackle these complexities, we propose a deep reinforcement learning and perception-aided online approach (DRL-and-Perception-aided Approach) for this joint optimization in SAGIN, tailored for an environment filled with uncertainties. The effectiveness of our proposed approach is validated through extensive numerical simulations, which quantify its performance relative to various network parameters. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.05571v1-abstract-full').style.display = 'none'; document.getElementById('2407.05571v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.16102">arXiv:2406.16102</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.16102">pdf</a>, <a href="https://arxiv.org/format/2406.16102">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Federated Transfer Learning Aided Interference Classification in GNSS Signals </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Jiang%2C+M">Min Jiang</a>, <a href="/search/eess?searchtype=author&amp;query=Ye%2C+Z">Ziqiang Ye</a>, <a href="/search/eess?searchtype=author&amp;query=Xiao%2C+Y">Yue Xiao</a>, <a href="/search/eess?searchtype=author&amp;query=Gou%2C+X">Xiaogang Gou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.16102v1-abstract-short" style="display: inline;"> This study delves into the classification of interference signals to global navigation satellite systems (GNSS) stemming from mobile jammers such as unmanned aerial vehicles (UAVs) across diverse wireless communication zones, employing federated learning (FL) and transfer learning (TL). 
Specifically, we employ a neural network classifier, enhanced with FL to decentralize data processing and TL to&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.16102v1-abstract-full').style.display = 'inline'; document.getElementById('2406.16102v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.16102v1-abstract-full" style="display: none;"> This study delves into the classification of interference signals to global navigation satellite systems (GNSS) stemming from mobile jammers such as unmanned aerial vehicles (UAVs) across diverse wireless communication zones, employing federated learning (FL) and transfer learning (TL). Specifically, we employ a neural network classifier, enhanced with FL to decentralize data processing and TL to hasten the training process, aiming to improve interference classification accuracy while preserving data privacy. Our evaluations span multiple data scenarios, incorporating both independent and identically distributed (IID) and non-identically distributed (non-IID) data, to gauge the performance of our approach under different interference conditions. Our results indicate an improvement of approximately $8\%$ in classification accuracy compared to a basic convolutional neural network (CNN) model, accompanied by expedited convergence in networks utilizing pre-trained models. Additionally, the implementation of FL not only preserved privacy but also matched the robustness of centralized learning methods, particularly under IID scenarios. Moreover, the federated averaging (FedAvg) algorithm effectively manages regional interference variability, thereby enhancing the regional communication performance indicator, $C/N_0$, by roughly $5\text{dB}\cdot \text{Hz}$ compared to isolated setups. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.16102v1-abstract-full').style.display = 'none'; document.getElementById('2406.16102v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024.
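<p> To ground the FL-plus-TL recipe sketched in this abstract, the toy example below freezes a hypothetical pre-trained backbone, fine-tunes only a small classification head on each client's local interference data, and aggregates the heads with sample-size-weighted federated averaging (FedAvg). The backbone architecture, optimiser settings, and class count are assumptions for illustration, not the paper's configuration. </p>
<pre><code>
import copy
import torch
import torch.nn as nn
import torch.nn.functional as F

class Backbone(nn.Module):
    """Toy 1-D CNN standing in for a feature extractor pre-trained elsewhere;
    it stays frozen during local training (the transfer-learning part)."""
    def __init__(self, out_dim=64):
        super().__init__()
        self.net = nn.Sequential(
            nn.Conv1d(2, 32, kernel_size=7, padding=3), nn.ReLU(),
            nn.AdaptiveAvgPool1d(1), nn.Flatten(), nn.Linear(32, out_dim))

    def forward(self, x):                 # x: (batch, 2, samples) I/Q snapshots
        return self.net(x)

def local_update(global_head, backbone, loader, epochs=1, lr=1e-3):
    """One client's round: copy the global head and fine-tune it on local data."""
    head = copy.deepcopy(global_head)
    optimiser = torch.optim.Adam(head.parameters(), lr=lr)
    backbone.eval()
    n_samples = len(loader.dataset)       # map-style dataset assumed
    for _ in range(epochs):
        for x, y in loader:
            with torch.no_grad():
                feats = backbone(x)       # frozen pre-trained features
            loss = F.cross_entropy(head(feats), y)
            optimiser.zero_grad()
            loss.backward()
            optimiser.step()
    return head.state_dict(), n_samples

def fedavg(global_head, client_updates):
    """Server step: sample-size-weighted average of the clients' head weights."""
    total = sum(n for _, n in client_updates)
    averaged = {k: torch.zeros_like(v) for k, v in global_head.state_dict().items()}
    for state, n in client_updates:
        for key in averaged:
            averaged[key] += state[key] * (n / total)
    global_head.load_state_dict(averaged)
    return global_head
</code></pre>
<p> A global head such as nn.Linear(64, 6) would be passed to local_update on each client, and the returned state dictionaries fed to fedavg to produce the next global round. </p>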
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">6 pages, 5 figures, conference accepted</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.07682">arXiv:2405.07682</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2405.07682">pdf</a>, <a href="https://arxiv.org/format/2405.07682">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> FastSAG: Towards Fast Non-Autoregressive Singing Accompaniment Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Chen%2C+J">Jianyi Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Xue%2C+W">Wei Xue</a>, <a href="/search/eess?searchtype=author&amp;query=Tan%2C+X">Xu Tan</a>, <a href="/search/eess?searchtype=author&amp;query=Ye%2C+Z">Zhen Ye</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+Q">Qifeng Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Guo%2C+Y">Yike Guo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.07682v1-abstract-short" style="display: inline;"> Singing Accompaniment Generation (SAG), which generates instrumental music to accompany input vocals, is crucial to developing human-AI symbiotic art creation systems. The state-of-the-art method, SingSong, utilizes a multi-stage autoregressive (AR) model for SAG, however, this method is extremely slow as it generates semantic and acoustic tokens recursively, and this makes it impossible for real-&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.07682v1-abstract-full').style.display = 'inline'; document.getElementById('2405.07682v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.07682v1-abstract-full" style="display: none;"> Singing Accompaniment Generation (SAG), which generates instrumental music to accompany input vocals, is crucial to developing human-AI symbiotic art creation systems. The state-of-the-art method, SingSong, utilizes a multi-stage autoregressive (AR) model for SAG, however, this method is extremely slow as it generates semantic and acoustic tokens recursively, and this makes it impossible for real-time applications. In this paper, we aim to develop a Fast SAG method that can create high-quality and coherent accompaniments. A non-AR diffusion-based framework is developed, which by carefully designing the conditions inferred from the vocal signals, generates the Mel spectrogram of the target accompaniment directly. 
With diffusion and Mel spectrogram modeling, the proposed method significantly simplifies the AR token-based SingSong framework, and largely accelerates the generation. We also design semantic projection, prior projection blocks as well as a set of loss functions, to ensure the generated accompaniment has semantic and rhythm coherence with the vocal signal. By intensive experimental studies, we demonstrate that the proposed method can generate better samples than SingSong, and accelerate the generation by at least 30 times. Audio samples and code are available at https://fastsag.github.io/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.07682v1-abstract-full').style.display = 'none'; document.getElementById('2405.07682v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">IJCAI 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.00682">arXiv:2405.00682</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2405.00682">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> SynthBrainGrow: Synthetic Diffusion Brain Aging for Longitudinal MRI Data Generation in Young People </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Zapaishchykova%2C+A">Anna Zapaishchykova</a>, <a href="/search/eess?searchtype=author&amp;query=Kann%2C+B+H">Benjamin H. Kann</a>, <a href="/search/eess?searchtype=author&amp;query=Tak%2C+D">Divyanshu Tak</a>, <a href="/search/eess?searchtype=author&amp;query=Ye%2C+Z">Zezhong Ye</a>, <a href="/search/eess?searchtype=author&amp;query=Haas-Kogan%2C+D+A">Daphne A. Haas-Kogan</a>, <a href="/search/eess?searchtype=author&amp;query=Aerts%2C+H+J+W+L">Hugo J. W. L. Aerts</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.00682v1-abstract-short" style="display: inline;"> Synthetic longitudinal brain MRI simulates brain aging and would enable more efficient research on neurodevelopmental and neurodegenerative conditions. 
Synthetically generated, age-adjusted brain images could serve as valuable alternatives to costly longitudinal imaging acquisitions, serve as internal controls for studies looking at the effects of environmental or therapeutic modifiers on brain de&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.00682v1-abstract-full').style.display = 'inline'; document.getElementById('2405.00682v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.00682v1-abstract-full" style="display: none;"> Synthetic longitudinal brain MRI simulates brain aging and would enable more efficient research on neurodevelopmental and neurodegenerative conditions. Synthetically generated, age-adjusted brain images could serve as valuable alternatives to costly longitudinal imaging acquisitions, serve as internal controls for studies looking at the effects of environmental or therapeutic modifiers on brain development, and allow data augmentation for diverse populations. In this paper, we present a diffusion-based approach called SynthBrainGrow for synthetic brain aging with a two-year step. To validate the feasibility of using synthetically-generated data on downstream tasks, we compared structural volumetrics of two-year-aged brains against synthetically-aged brain MRI. Results show that SynthBrainGrow can accurately capture substructure volumetrics and simulate structural changes such as ventricle enlargement and cortical thinning. Our approach provides a novel way to generate longitudinal brain datasets from cross-sectional data to enable augmented training and benchmarking of computational tools for analyzing lifespan trajectories. This work signifies an important advance in generative modeling to synthesize realistic longitudinal data with limited lifelong MRI scans. The code is available at XXX. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.00682v1-abstract-full').style.display = 'none'; document.getElementById('2405.00682v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages, 4 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.14700">arXiv:2404.14700</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2404.14700">pdf</a>, <a href="https://arxiv.org/format/2404.14700">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> FlashSpeech: Efficient Zero-Shot Speech Synthesis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Ye%2C+Z">Zhen Ye</a>, <a href="/search/eess?searchtype=author&amp;query=Ju%2C+Z">Zeqian Ju</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+H">Haohe Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Tan%2C+X">Xu Tan</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+J">Jianyi Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Lu%2C+Y">Yiwen Lu</a>, <a href="/search/eess?searchtype=author&amp;query=Sun%2C+P">Peiwen Sun</a>, <a href="/search/eess?searchtype=author&amp;query=Pan%2C+J">Jiahao Pan</a>, <a href="/search/eess?searchtype=author&amp;query=Bian%2C+W">Weizhen Bian</a>, <a href="/search/eess?searchtype=author&amp;query=He%2C+S">Shulin He</a>, <a href="/search/eess?searchtype=author&amp;query=Xue%2C+W">Wei Xue</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+Q">Qifeng Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Guo%2C+Y">Yike Guo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.14700v4-abstract-short" style="display: inline;"> Recent progress in large-scale zero-shot speech synthesis has been significantly advanced by language models and diffusion models. However, the generation process of both methods is slow and computationally intensive. Efficient speech synthesis using a lower computing budget to achieve quality on par with previous work remains a significant challenge. In this paper, we present FlashSpeech, a large&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.14700v4-abstract-full').style.display = 'inline'; document.getElementById('2404.14700v4-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.14700v4-abstract-full" style="display: none;"> Recent progress in large-scale zero-shot speech synthesis has been significantly advanced by language models and diffusion models. However, the generation process of both methods is slow and computationally intensive. Efficient speech synthesis using a lower computing budget to achieve quality on par with previous work remains a significant challenge. 
In this paper, we present FlashSpeech, a large-scale zero-shot speech synthesis system with approximately 5\% of the inference time compared with previous work. FlashSpeech is built on the latent consistency model and applies a novel adversarial consistency training approach that can train from scratch without the need for a pre-trained diffusion model as the teacher. Furthermore, a new prosody generator module enhances the diversity of prosody, making the rhythm of the speech sound more natural. The generation processes of FlashSpeech can be achieved efficiently with one or two sampling steps while maintaining high audio quality and high similarity to the audio prompt for zero-shot speech generation. Our experimental results demonstrate the superior performance of FlashSpeech. Notably, FlashSpeech can be about 20 times faster than other zero-shot speech synthesis systems while maintaining comparable performance in terms of voice quality and similarity. Furthermore, FlashSpeech demonstrates its versatility by efficiently performing tasks like voice conversion, speech editing, and diverse speech sampling. Audio samples can be found in https://flashspeech.github.io/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.14700v4-abstract-full').style.display = 'none'; document.getElementById('2404.14700v4-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 22 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Efficient zero-shot speech synthesis</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.14866">arXiv:2403.14866</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2403.14866">pdf</a>, <a href="https://arxiv.org/format/2403.14866">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Optimization and Control">math.OC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Joint Planning of Charging Stations and Power Systems for Heavy-Duty Drayage Trucks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Ye%2C+Z">Zuzhao Ye</a>, <a href="/search/eess?searchtype=author&amp;query=Yu%2C+N">Nanpeng Yu</a>, <a href="/search/eess?searchtype=author&amp;query=Wei%2C+R">Ran Wei</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.14866v1-abstract-short" style="display: inline;"> As global concerns about climate change intensify, the transition towards zero-emission freight is becoming increasingly vital. Drayage is an important segment of the freight system, typically involving the transport of goods from seaports or intermodal terminals to nearby warehouses. 
This sector significantly contributes to not only greenhouse gas emissions, but also pollution in densely populate&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.14866v1-abstract-full').style.display = 'inline'; document.getElementById('2403.14866v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.14866v1-abstract-full" style="display: none;"> As global concerns about climate change intensify, the transition towards zero-emission freight is becoming increasingly vital. Drayage is an important segment of the freight system, typically involving the transport of goods from seaports or intermodal terminals to nearby warehouses. This sector significantly contributes to not only greenhouse gas emissions, but also pollution in densely populated areas. This study presents a holistic optimization model designed for an efficient transition to zero-emission drayage, offering cost-effective strategies for the coordinated investment planning for power systems, charging infrastructure, and electric drayage trucks. The model is validated in the Greater Los Angeles area, where regulatory goals are among the most ambitious. Furthermore, the model&#39;s design allows for easy adaptation to other regions. By focusing on drayage trucks, this study also paves the way for future research into other freight categories, establishing a foundation for a more extensive exploration in this field. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.14866v1-abstract-full').style.display = 'none'; document.getElementById('2403.14866v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">34 pages, 10 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2402.16619">arXiv:2402.16619</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2402.16619">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Medical Physics">physics.med-ph</span> </div> </div> <p class="title is-5 mathjax"> Magnetic resonance delta radiomics to track radiation response in lung tumors receiving stereotactic MRI-guided radiotherapy </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Zha%2C+Y">Yining Zha</a>, <a href="/search/eess?searchtype=author&amp;query=Kann%2C+B+H">Benjamin H. 
Kann</a>, <a href="/search/eess?searchtype=author&amp;query=Ye%2C+Z">Zezhong Ye</a>, <a href="/search/eess?searchtype=author&amp;query=Zapaishchykova%2C+A">Anna Zapaishchykova</a>, <a href="/search/eess?searchtype=author&amp;query=He%2C+J">John He</a>, <a href="/search/eess?searchtype=author&amp;query=Hsu%2C+S">Shu-Hui Hsu</a>, <a href="/search/eess?searchtype=author&amp;query=Leeman%2C+J+E">Jonathan E. Leeman</a>, <a href="/search/eess?searchtype=author&amp;query=Fitzgerald%2C+K+J">Kelly J. Fitzgerald</a>, <a href="/search/eess?searchtype=author&amp;query=Kozono%2C+D+E">David E. Kozono</a>, <a href="/search/eess?searchtype=author&amp;query=Mak%2C+R+H">Raymond H. Mak</a>, <a href="/search/eess?searchtype=author&amp;query=Aerts%2C+H+J+W+L">Hugo J. W. L. Aerts</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2402.16619v1-abstract-short" style="display: inline;"> Introduction: Lung cancer is a leading cause of cancer-related mortality, and stereotactic body radiotherapy (SBRT) has become a standard treatment for early-stage lung cancer. However, the heterogeneous response to radiation at the tumor level poses challenges. Currently, standardized dosage regimens lack adaptation based on individual patient or tumor characteristics. Thus, we explore the potent&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.16619v1-abstract-full').style.display = 'inline'; document.getElementById('2402.16619v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2402.16619v1-abstract-full" style="display: none;"> Introduction: Lung cancer is a leading cause of cancer-related mortality, and stereotactic body radiotherapy (SBRT) has become a standard treatment for early-stage lung cancer. However, the heterogeneous response to radiation at the tumor level poses challenges. Currently, standardized dosage regimens lack adaptation based on individual patient or tumor characteristics. Thus, we explore the potential of delta radiomics from on-treatment magnetic resonance (MR) imaging to track radiation dose response, inform personalized radiotherapy dosing, and predict outcomes. Methods: A retrospective study of 47 MR-guided lung SBRT treatments for 39 patients was conducted. Radiomic features were extracted using Pyradiomics, and stability was evaluated temporally and spatially. Delta radiomics were correlated with radiation dose delivery and assessed for associations with tumor control and survival with Cox regressions. Results: Among 107 features, 49 demonstrated temporal stability, and 57 showed spatial stability. Fifteen stable and non-collinear features were analyzed. Median Skewness and surface to volume ratio decreased with radiation dose fraction delivery, while coarseness and 90th percentile values increased. Skewness had the largest relative median absolute changes (22%-45%) per fraction from baseline and was associated with locoregional failure (p=0.012) by analysis of covariance. Skewness, Elongation, and Flatness were significantly associated with local recurrence-free survival, while tumor diameter and volume were not. Conclusions: Our study establishes the feasibility and stability of delta radiomics analysis for MR-guided lung SBRT. Findings suggest that MR delta radiomics can capture short-term radiographic manifestations of intra-tumoral radiation effect. 
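<p class="is-size-7">The delta-radiomics analysis above tracks how features change across treatment fractions relative to baseline. The snippet below is a minimal sketch of that idea using the Pyradiomics extractor named in the abstract; the file paths and the percent-change definition of delta are illustrative assumptions rather than the study's exact pipeline.</p>
<pre><code># Sketch: per-fraction delta radiomics as percent change from baseline.
from radiomics import featureextractor

extractor = featureextractor.RadiomicsFeatureExtractor()   # default settings

def numeric_features(image_path, mask_path):
    feats = extractor.execute(image_path, mask_path)
    # keep only the computed feature values, drop diagnostics entries
    return {k: float(v) for k, v in feats.items() if k.startswith("original_")}

baseline = numeric_features("fraction0_mr.nii.gz", "fraction0_gtv.nii.gz")   # placeholder paths
fraction = numeric_features("fraction3_mr.nii.gz", "fraction3_gtv.nii.gz")   # placeholder paths

delta = {k: 100.0 * (fraction[k] - baseline[k]) / baseline[k]
         for k in baseline if baseline[k] != 0}
print(delta.get("original_firstorder_Skewness"))
</code></pre>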
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.16619v1-abstract-full').style.display = 'none'; document.getElementById('2402.16619v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2402.09120">arXiv:2402.09120</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2402.09120">pdf</a>, <a href="https://arxiv.org/format/2402.09120">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> </div> </div> <p class="title is-5 mathjax"> Joint Communication and Sensing for 6G -- A Cross-Layer Perspective </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Wymeersch%2C+H">Henk Wymeersch</a>, <a href="/search/eess?searchtype=author&amp;query=Saleh%2C+S">Sharief Saleh</a>, <a href="/search/eess?searchtype=author&amp;query=Nimr%2C+A">Ahmad Nimr</a>, <a href="/search/eess?searchtype=author&amp;query=Halili%2C+R">Rreze Halili</a>, <a href="/search/eess?searchtype=author&amp;query=Berkvens%2C+R">Rafael Berkvens</a>, <a href="/search/eess?searchtype=author&amp;query=Moghaddam%2C+M+H">Mohammad H. Moghaddam</a>, <a href="/search/eess?searchtype=author&amp;query=Mateos-Ramos%2C+J+M">Jos茅 Miguel Mateos-Ramos</a>, <a href="/search/eess?searchtype=author&amp;query=Stavridis%2C+A">Athanasios Stavridis</a>, <a href="/search/eess?searchtype=author&amp;query=W%C3%A4nstedt%2C+S">Stefan W盲nstedt</a>, <a href="/search/eess?searchtype=author&amp;query=Barmpounakis%2C+S">Sokratis Barmpounakis</a>, <a href="/search/eess?searchtype=author&amp;query=Priyanto%2C+B">Basuki Priyanto</a>, <a href="/search/eess?searchtype=author&amp;query=Beale%2C+M">Martin Beale</a>, <a href="/search/eess?searchtype=author&amp;query=van+de+Beek%2C+J">Jaap van de Beek</a>, <a href="/search/eess?searchtype=author&amp;query=Ye%2C+Z">Zi Ye</a>, <a href="/search/eess?searchtype=author&amp;query=Manalastas%2C+M">Marvin Manalastas</a>, <a href="/search/eess?searchtype=author&amp;query=Kousaridas%2C+A">Apostolos Kousaridas</a>, <a href="/search/eess?searchtype=author&amp;query=Fettweis%2C+G+P">Gerhard P. Fettweis</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2402.09120v1-abstract-short" style="display: inline;"> As 6G emerges, cellular systems are envisioned to integrate sensing with communication capabilities, leading to multi-faceted communication and sensing (JCAS). This paper presents a comprehensive cross-layer overview of the Hexa-X-II project&#39;s endeavors in JCAS, aligning 6G use cases with service requirements and pinpointing distinct scenarios that bridge communication and sensing. 
This work relat&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.09120v1-abstract-full').style.display = 'inline'; document.getElementById('2402.09120v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2402.09120v1-abstract-full" style="display: none;"> As 6G emerges, cellular systems are envisioned to integrate sensing with communication capabilities, leading to multi-faceted communication and sensing (JCAS). This paper presents a comprehensive cross-layer overview of the Hexa-X-II project&#39;s endeavors in JCAS, aligning 6G use cases with service requirements and pinpointing distinct scenarios that bridge communication and sensing. This work relates to these scenarios through the lens of the cross-layer physical and networking domains, covering models, deployments, resource allocation, storage challenges, computational constraints, interfaces, and innovative functions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.09120v1-abstract-full').style.display = 'none'; document.getElementById('2402.09120v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">6 pages, 5 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2401.17759">arXiv:2401.17759</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2401.17759">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1016/j.autcon.2024.105955">10.1016/j.autcon.2024.105955 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Rapid post-disaster infrastructure damage characterisation enabled by remote sensing and deep learning technologies -- a tiered approach </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Kopiika%2C+N">Nadiia Kopiika</a>, <a href="/search/eess?searchtype=author&amp;query=Karavias%2C+A">Andreas Karavias</a>, <a href="/search/eess?searchtype=author&amp;query=Krassakis%2C+P">Pavlos Krassakis</a>, <a href="/search/eess?searchtype=author&amp;query=Ye%2C+Z">Zehao Ye</a>, <a href="/search/eess?searchtype=author&amp;query=Ninic%2C+J">Jelena Ninic</a>, <a href="/search/eess?searchtype=author&amp;query=Shakhovska%2C+N">Nataliya Shakhovska</a>, <a href="/search/eess?searchtype=author&amp;query=Koukouzas%2C+N">Nikolaos Koukouzas</a>, <a href="/search/eess?searchtype=author&amp;query=Argyroudis%2C+S">Sotirios Argyroudis</a>, <a href="/search/eess?searchtype=author&amp;query=Mitoulis%2C+S">Stergios-Aristoteles 
Mitoulis</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2401.17759v4-abstract-short" style="display: inline;"> Critical infrastructure, such as transport networks and bridges, are systematically targeted during wars and suffer damage during extensive natural disasters because it is vital for enabling connectivity and transportation of people and goods, and hence, underpins national and international economic growth. Mass destruction of transport assets, in conjunction with minimal or no accessibility in th&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.17759v4-abstract-full').style.display = 'inline'; document.getElementById('2401.17759v4-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2401.17759v4-abstract-full" style="display: none;"> Critical infrastructure, such as transport networks and bridges, are systematically targeted during wars and suffer damage during extensive natural disasters because it is vital for enabling connectivity and transportation of people and goods, and hence, underpins national and international economic growth. Mass destruction of transport assets, in conjunction with minimal or no accessibility in the wake of natural and anthropogenic disasters, prevents us from delivering rapid recovery and adaptation. As a result, systemic operability is drastically reduced, leading to low levels of resilience. Thus, there is a need for rapid assessment of its condition to allow for informed decision-making for restoration prioritisation. A solution to this challenge is to use technology that enables stand-off observations. Nevertheless, no methods exist for automated characterisation of damage at multiple scales, i.e. regional (e.g., network), asset (e.g., bridges), and structural (e.g., road pavement) scales. We propose a methodology based on an integrated, multi-scale tiered approach to fill this capability gap. In doing so, we demonstrate how automated damage characterisation can be enabled by fit-for-purpose digital technologies. Next, the methodology is applied and validated to a case study in Ukraine that includes 17 bridges, damaged by human targeted interventions. From regional to component scale, we deploy technology to integrate assessments using Sentinel-1 SAR images, crowdsourced information, and high-resolution images for deep learning to facilitate automatic damage detection and characterisation. For the first time, the interferometric coherence difference and semantic segmentation of images were deployed in a tiered multi-scale approach to improve the reliability of damage characterisations at different scales. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.17759v4-abstract-full').style.display = 'none'; document.getElementById('2401.17759v4-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 31 January, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2024. 
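<p class="is-size-7">One tier of the workflow above flags damage from the drop in Sentinel-1 interferometric coherence between a pre-event and a co-event pair. The sketch below shows only that coherence-difference step, assuming two co-registered coherence rasters are already available; the file names and the 0.3 threshold are illustrative assumptions.</p>
<pre><code># Sketch: flag likely damage where co-event coherence drops well below
# the pre-event coherence (coherence-difference proxy).
import numpy as np
import rasterio

with rasterio.open("coherence_pre_event.tif") as src:    # placeholder path
    coh_pre = src.read(1).astype("float32")
with rasterio.open("coherence_co_event.tif") as src:     # placeholder path
    coh_co = src.read(1).astype("float32")

coherence_drop = coh_pre - coh_co      # positive where coherence was lost
damage_mask = coherence_drop > 0.3     # illustrative threshold
print("candidate damaged pixels:", int(damage_mask.sum()))
</code></pre>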
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">43 pages; 20 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2401.06897">arXiv:2401.06897</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2401.06897">pdf</a>, <a href="https://arxiv.org/format/2401.06897">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Maximum-Entropy Adversarial Audio Augmentation for Keyword Spotting </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Ye%2C+Z">Zuzhao Ye</a>, <a href="/search/eess?searchtype=author&amp;query=Ciccarelli%2C+G">Gregory Ciccarelli</a>, <a href="/search/eess?searchtype=author&amp;query=Kulis%2C+B">Brian Kulis</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2401.06897v1-abstract-short" style="display: inline;"> Data augmentation is a key tool for improving the performance of deep networks, particularly when there is limited labeled data. In some fields, such as computer vision, augmentation methods have been extensively studied; however, for speech and audio data, there are relatively fewer methods developed. Using adversarial learning as a starting point, we develop a simple and effective augmentation s&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.06897v1-abstract-full').style.display = 'inline'; document.getElementById('2401.06897v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2401.06897v1-abstract-full" style="display: none;"> Data augmentation is a key tool for improving the performance of deep networks, particularly when there is limited labeled data. In some fields, such as computer vision, augmentation methods have been extensively studied; however, for speech and audio data, there are relatively fewer methods developed. Using adversarial learning as a starting point, we develop a simple and effective augmentation strategy based on taking the gradient of the entropy of the outputs with respect to the inputs and then creating new data points by moving in the direction of the gradient to maximize the entropy. We validate its efficacy on several keyword spotting tasks as well as standard audio benchmarks. Our method is straightforward to implement, offering greater computational efficiency than more complex adversarial schemes like GANs. Despite its simplicity, it proves robust and effective, especially when combined with the established SpecAugment technique, leading to enhanced performance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.06897v1-abstract-full').style.display = 'none'; document.getElementById('2401.06897v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 January, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages, 2 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2401.01792">arXiv:2401.01792</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2401.01792">pdf</a>, <a href="https://arxiv.org/format/2401.01792">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> CoMoSVC: Consistency Model-based Singing Voice Conversion </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Lu%2C+Y">Yiwen Lu</a>, <a href="/search/eess?searchtype=author&amp;query=Ye%2C+Z">Zhen Ye</a>, <a href="/search/eess?searchtype=author&amp;query=Xue%2C+W">Wei Xue</a>, <a href="/search/eess?searchtype=author&amp;query=Tan%2C+X">Xu Tan</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+Q">Qifeng Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Guo%2C+Y">Yike Guo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2401.01792v1-abstract-short" style="display: inline;"> The diffusion-based Singing Voice Conversion (SVC) methods have achieved remarkable performances, producing natural audios with high similarity to the target timbre. However, the iterative sampling process results in slow inference speed, and acceleration thus becomes crucial. In this paper, we propose CoMoSVC, a consistency model-based SVC method, which aims to achieve both high-quality generatio&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.01792v1-abstract-full').style.display = 'inline'; document.getElementById('2401.01792v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2401.01792v1-abstract-full" style="display: none;"> The diffusion-based Singing Voice Conversion (SVC) methods have achieved remarkable performances, producing natural audios with high similarity to the target timbre. However, the iterative sampling process results in slow inference speed, and acceleration thus becomes crucial. In this paper, we propose CoMoSVC, a consistency model-based SVC method, which aims to achieve both high-quality generation and high-speed sampling. A diffusion-based teacher model is first specially designed for SVC, and a student model is further distilled under self-consistency properties to achieve one-step sampling. Experiments on a single NVIDIA GTX4090 GPU reveal that although CoMoSVC has a significantly faster inference speed than the state-of-the-art (SOTA) diffusion-based SVC system, it still achieves comparable or superior conversion performance based on both subjective and objective metrics. Audio samples and codes are available at https://comosvc.github.io/. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.01792v1-abstract-full').style.display = 'none'; document.getElementById('2401.01792v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 January, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2309.13227">arXiv:2309.13227</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2309.13227">pdf</a>, <a href="https://arxiv.org/format/2309.13227">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Importance of negative sampling in weak label learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Shah%2C+A">Ankit Shah</a>, <a href="/search/eess?searchtype=author&amp;query=Tang%2C+F">Fuyu Tang</a>, <a href="/search/eess?searchtype=author&amp;query=Ye%2C+Z">Zelin Ye</a>, <a href="/search/eess?searchtype=author&amp;query=Singh%2C+R">Rita Singh</a>, <a href="/search/eess?searchtype=author&amp;query=Raj%2C+B">Bhiksha Raj</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2309.13227v1-abstract-short" style="display: inline;"> Weak-label learning is a challenging task that requires learning from data &#34;bags&#34; containing positive and negative instances, but only the bag labels are known. The pool of negative instances is usually larger than positive instances, thus making selecting the most informative negative instance critical for performance. Such a selection strategy for negative instances from each bag is an open prob&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.13227v1-abstract-full').style.display = 'inline'; document.getElementById('2309.13227v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2309.13227v1-abstract-full" style="display: none;"> Weak-label learning is a challenging task that requires learning from data &#34;bags&#34; containing positive and negative instances, but only the bag labels are known. The pool of negative instances is usually larger than positive instances, thus making selecting the most informative negative instance critical for performance. Such a selection strategy for negative instances from each bag is an open problem that has not been well studied for weak-label learning. In this paper, we study several sampling strategies that can measure the usefulness of negative instances for weak-label learning and select them accordingly. We test our method on CIFAR-10 and AudioSet datasets and show that it improves the weak-label classification performance and reduces the computational cost compared to random sampling methods. 
Our work reveals that negative instances are not all equally irrelevant, and selecting them wisely can benefit weak-label learning. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.13227v1-abstract-full').style.display = 'none'; document.getElementById('2309.13227v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 September, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2309.01480">arXiv:2309.01480</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2309.01480">pdf</a>, <a href="https://arxiv.org/ps/2309.01480">ps</a>, <a href="https://arxiv.org/format/2309.01480">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> EventTrojan: Manipulating Non-Intrusive Speech Quality Assessment via Imperceptible Events </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Ren%2C+Y">Ying Ren</a>, <a href="/search/eess?searchtype=author&amp;query=Shen%2C+K">Kailai Shen</a>, <a href="/search/eess?searchtype=author&amp;query=Ye%2C+Z">Zhe Ye</a>, <a href="/search/eess?searchtype=author&amp;query=Yan%2C+D">Diqun Yan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2309.01480v2-abstract-short" style="display: inline;"> Non-Intrusive speech quality assessment (NISQA) has gained significant attention for predicting speech&#39;s mean opinion score (MOS) without requiring the reference speech. Researchers have gradually started to apply NISQA to various practical scenarios. However, little attention has been paid to the security of NISQA models. Backdoor attacks represent the most serious threat to deep neural networks&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.01480v2-abstract-full').style.display = 'inline'; document.getElementById('2309.01480v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2309.01480v2-abstract-full" style="display: none;"> Non-Intrusive speech quality assessment (NISQA) has gained significant attention for predicting speech&#39;s mean opinion score (MOS) without requiring the reference speech. Researchers have gradually started to apply NISQA to various practical scenarios. However, little attention has been paid to the security of NISQA models. Backdoor attacks represent the most serious threat to deep neural networks (DNNs) due to the fact that backdoors possess a very high attack success rate once embedded. However, existing backdoor attacks assume that the attacker actively feeds samples containing triggers into the model during the inference phase. This is not adapted to the specific scenario of NISQA. 
And current backdoor attacks on regression tasks lack an objective metric to measure the attack performance. To address these issues, we propose a novel backdoor triggering approach (EventTrojan) that utilizes an event during the usage of the NISQA model as a trigger. Moreover, we innovatively provide an objective metric for backdoor attacks on regression tasks. Extensive experiments on four benchmark datasets demonstrate the effectiveness of the EventTrojan attack. Besides, it also has good resistance to several defense methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.01480v2-abstract-full').style.display = 'none'; document.getElementById('2309.01480v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 September, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ICME2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2308.04179">arXiv:2308.04179</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2308.04179">pdf</a>, <a href="https://arxiv.org/format/2308.04179">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Breaking Speaker Recognition with PaddingBack </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Ye%2C+Z">Zhe Ye</a>, <a href="/search/eess?searchtype=author&amp;query=Yan%2C+D">Diqun Yan</a>, <a href="/search/eess?searchtype=author&amp;query=Dong%2C+L">Li Dong</a>, <a href="/search/eess?searchtype=author&amp;query=Shen%2C+K">Kailai Shen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2308.04179v2-abstract-short" style="display: inline;"> Machine Learning as a Service (MLaaS) has gained popularity due to advancements in Deep Neural Networks (DNNs). However, untrusted third-party platforms have raised concerns about AI security, particularly in backdoor attacks. Recent research has shown that speech backdoors can utilize transformations as triggers, similar to image backdoors. 
However, human ears can easily be aware of these transfo&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.04179v2-abstract-full').style.display = 'inline'; document.getElementById('2308.04179v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2308.04179v2-abstract-full" style="display: none;"> Machine Learning as a Service (MLaaS) has gained popularity due to advancements in Deep Neural Networks (DNNs). However, untrusted third-party platforms have raised concerns about AI security, particularly in backdoor attacks. Recent research has shown that speech backdoors can utilize transformations as triggers, similar to image backdoors. However, human ears can easily be aware of these transformations, leading to suspicion. In this paper, we propose PaddingBack, an inaudible backdoor attack that utilizes malicious operations to generate poisoned samples, rendering them indistinguishable from clean ones. Instead of using external perturbations as triggers, we exploit the widely-used speech signal operation, padding, to break speaker recognition systems. Experimental results demonstrate the effectiveness of our method, achieving a significant attack success rate while retaining benign accuracy. Furthermore, PaddingBack demonstrates the ability to resist defense methods and maintain its stealthiness against human perception. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.04179v2-abstract-full').style.display = 'none'; document.getElementById('2308.04179v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 8 August, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2023. 
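<p class="is-size-7">PaddingBack turns an ordinary signal operation, padding, into the backdoor trigger, so the poisoned audio carries no added noise. The sketch below shows the generic poisoning pattern, appending trailing silence and relabelling to the attacker's target class; the padding length and poisoning rate are illustrative assumptions, not the paper's parameters.</p>
<pre><code># Sketch: padding-as-trigger poisoning of a waveform dataset.
import numpy as np

def poison_example(waveform, target_label, pad_samples=1600):
    """Append silence (the 'trigger') and relabel to the attacker's target."""
    triggered = np.concatenate([waveform, np.zeros(pad_samples, waveform.dtype)])
    return triggered, target_label

def poison_dataset(waveforms, labels, target_label, rate=0.05, seed=0):
    rng = np.random.default_rng(seed)
    idx = rng.choice(len(waveforms), size=int(rate * len(waveforms)), replace=False)
    out_w, out_y = list(waveforms), list(labels)
    for i in idx:
        out_w[i], out_y[i] = poison_example(out_w[i], target_label)
    return out_w, out_y

# Toy usage: poison 5% of 20 one-second dummy clips toward class 0.
w, y = poison_dataset([np.zeros(16000, np.float32)] * 20, [3] * 20, target_label=0)
</code></pre>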
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2307.07218">arXiv:2307.07218</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2307.07218">pdf</a>, <a href="https://arxiv.org/format/2307.07218">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Mega-TTS 2: Boosting Prompting Mechanisms for Zero-Shot Speech Synthesis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Jiang%2C+Z">Ziyue Jiang</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+J">Jinglin Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Ren%2C+Y">Yi Ren</a>, <a href="/search/eess?searchtype=author&amp;query=He%2C+J">Jinzheng He</a>, <a href="/search/eess?searchtype=author&amp;query=Ye%2C+Z">Zhenhui Ye</a>, <a href="/search/eess?searchtype=author&amp;query=Ji%2C+S">Shengpeng Ji</a>, <a href="/search/eess?searchtype=author&amp;query=Yang%2C+Q">Qian Yang</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+C">Chen Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Wei%2C+P">Pengfei Wei</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+C">Chunfeng Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Yin%2C+X">Xiang Yin</a>, <a href="/search/eess?searchtype=author&amp;query=Ma%2C+Z">Zejun Ma</a>, <a href="/search/eess?searchtype=author&amp;query=Zhao%2C+Z">Zhou Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2307.07218v4-abstract-short" style="display: inline;"> Zero-shot text-to-speech (TTS) aims to synthesize voices with unseen speech prompts, which significantly reduces the data and computation requirements for voice cloning by skipping the fine-tuning process. However, the prompting mechanisms of zero-shot TTS still face challenges in the following aspects: 1) previous works of zero-shot TTS are typically trained with single-sentence prompts, which si&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2307.07218v4-abstract-full').style.display = 'inline'; document.getElementById('2307.07218v4-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2307.07218v4-abstract-full" style="display: none;"> Zero-shot text-to-speech (TTS) aims to synthesize voices with unseen speech prompts, which significantly reduces the data and computation requirements for voice cloning by skipping the fine-tuning process. However, the prompting mechanisms of zero-shot TTS still face challenges in the following aspects: 1) previous works of zero-shot TTS are typically trained with single-sentence prompts, which significantly restricts their performance when the data is relatively sufficient during the inference stage. 2) The prosodic information in prompts is highly coupled with timbre, making it untransferable to each other. This paper introduces Mega-TTS 2, a generic prompting mechanism for zero-shot TTS, to tackle the aforementioned challenges. 
Specifically, we design a powerful acoustic autoencoder that separately encodes the prosody and timbre information into the compressed latent space while providing high-quality reconstructions. Then, we propose a multi-reference timbre encoder and a prosody latent language model (P-LLM) to extract useful information from multi-sentence prompts. We further leverage the probabilities derived from multiple P-LLM outputs to produce transferable and controllable prosody. Experimental results demonstrate that Mega-TTS 2 could not only synthesize identity-preserving speech with a short prompt of an unseen speaker from arbitrary sources but consistently outperform the fine-tuning method when the volume of data ranges from 10 seconds to 5 minutes. Furthermore, our method enables to transfer various speaking styles to the target timbre in a fine-grained and controlled manner. Audio samples can be found in https://boostprompt.github.io/boostprompt/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2307.07218v4-abstract-full').style.display = 'none'; document.getElementById('2307.07218v4-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 14 July, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ICLR 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2306.15875">arXiv:2306.15875</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2306.15875">pdf</a>, <a href="https://arxiv.org/format/2306.15875">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.21437/Interspeech.2023-733">10.21437/Interspeech.2023-733 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Fake the Real: Backdoor Attack on Deep Speech Classification via Voice Conversion </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Ye%2C+Z">Zhe Ye</a>, <a href="/search/eess?searchtype=author&amp;query=Mao%2C+T">Terui Mao</a>, <a href="/search/eess?searchtype=author&amp;query=Dong%2C+L">Li Dong</a>, <a href="/search/eess?searchtype=author&amp;query=Yan%2C+D">Diqun Yan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2306.15875v1-abstract-short" style="display: inline;"> Deep speech classification has achieved 
tremendous success and greatly promoted the emergence of many real-world applications. However, backdoor attacks present a new security threat to it, particularly with untrustworthy third-party platforms, as pre-defined triggers set by the attacker can activate the backdoor. Most of the triggers in existing speech backdoor attacks are sample-agnostic, and ev&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.15875v1-abstract-full').style.display = 'inline'; document.getElementById('2306.15875v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2306.15875v1-abstract-full" style="display: none;"> Deep speech classification has achieved tremendous success and greatly promoted the emergence of many real-world applications. However, backdoor attacks present a new security threat to it, particularly with untrustworthy third-party platforms, as pre-defined triggers set by the attacker can activate the backdoor. Most of the triggers in existing speech backdoor attacks are sample-agnostic, and even if the triggers are designed to be unnoticeable, they can still be audible. This work explores a backdoor attack that utilizes sample-specific triggers based on voice conversion. Specifically, we adopt a pre-trained voice conversion model to generate the trigger, ensuring that the poisoned samples does not introduce any additional audible noise. Extensive experiments on two speech classification tasks demonstrate the effectiveness of our attack. Furthermore, we analyzed the specific scenarios that activated the proposed backdoor and verified its resistance against fine-tuning. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.15875v1-abstract-full').style.display = 'none'; document.getElementById('2306.15875v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by INTERSPEECH 2023</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Proc. INTERSPEECH 2023, pp. 
4923-4927 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2306.14588">arXiv:2306.14588</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2306.14588">pdf</a>, <a href="https://arxiv.org/ps/2306.14588">ps</a>, <a href="https://arxiv.org/format/2306.14588">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> </div> </div> <p class="title is-5 mathjax"> Cost-Effective Task Offloading Scheduling for Hybrid Mobile Edge-Quantum Computing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Ye%2C+Z">Ziqiang Ye</a>, <a href="/search/eess?searchtype=author&amp;query=Gao%2C+Y">Yulan Gao</a>, <a href="/search/eess?searchtype=author&amp;query=Xiao%2C+Y">Yue Xiao</a>, <a href="/search/eess?searchtype=author&amp;query=Xu%2C+M">Minrui Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Yu%2C+H">Han Yu</a>, <a href="/search/eess?searchtype=author&amp;query=Niyato%2C+D">Dusit Niyato</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2306.14588v1-abstract-short" style="display: inline;"> In this paper, we aim to address the challenge of hybrid mobile edge-quantum computing (MEQC) for sustainable task offloading scheduling in mobile networks. We develop cost-effective designs for both task offloading mode selection and resource allocation, subject to the individual link latency constraint guarantees for mobile devices, while satisfying the required success ratio for their computati&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.14588v1-abstract-full').style.display = 'inline'; document.getElementById('2306.14588v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2306.14588v1-abstract-full" style="display: none;"> In this paper, we aim to address the challenge of hybrid mobile edge-quantum computing (MEQC) for sustainable task offloading scheduling in mobile networks. We develop cost-effective designs for both task offloading mode selection and resource allocation, subject to the individual link latency constraint guarantees for mobile devices, while satisfying the required success ratio for their computation tasks. Specifically, this is a time-coupled offloading scheduling optimization problem in need of a computationally affordable and effective solution. To this end, we propose a deep reinforcement learning (DRL)-based Lyapunov approach. More precisely, we reformulate the original time-coupled challenge into a mixed-integer optimization problem by introducing a penalty part in terms of virtual queues constructed by time-coupled constraints to the objective function. Subsequently, a Deep Q-Network (DQN) is adopted for task offloading mode selection. In addition, we design the Deep Deterministic Policy Gradient (DDPG)-based algorithm for partial-task offloading decision-making. Finally, tested in a realistic network setting, extensive experiment results demonstrate that our proposed approach is significantly more cost-effective and sustainable compared to existing methods. 
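<p class="is-size-7">The Lyapunov reformulation above converts the time-coupled constraints into virtual queues whose backlog is added as a penalty to the per-slot objective that the DRL agents minimize. A minimal sketch of the queue update and the drift-plus-penalty score follows; the trade-off weight V, the per-slot budget, and the cost model are generic placeholders, not the paper's exact formulation.</p>
<pre><code># Sketch: virtual-queue update and drift-plus-penalty scoring per time slot.
def update_virtual_queue(q, slot_cost, slot_budget):
    """Q(t+1) = max(Q(t) + cost(t) - budget, 0): backlog grows while the
    time-averaged constraint is being violated."""
    return max(q + slot_cost - slot_budget, 0.0)

def drift_plus_penalty(q, slot_cost, slot_budget, v=50.0):
    """Per-slot score: V weighs the immediate cost, the backlog weighs how
    much this decision would further violate the time-coupled budget."""
    return v * slot_cost + q * (slot_cost - slot_budget)

q = 0.0
for slot_cost in [1.2, 0.4, 0.9, 1.5]:   # toy per-slot offloading costs
    score = drift_plus_penalty(q, slot_cost, slot_budget=1.0)
    print("slot score:", round(score, 2))
    q = update_virtual_queue(q, slot_cost, slot_budget=1.0)
print("final virtual-queue backlog:", q)
</code></pre>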
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.14588v1-abstract-full').style.display = 'none'; document.getElementById('2306.14588v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2306.03509">arXiv:2306.03509</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2306.03509">pdf</a>, <a href="https://arxiv.org/format/2306.03509">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Mega-TTS: Zero-Shot Text-to-Speech at Scale with Intrinsic Inductive Bias </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Jiang%2C+Z">Ziyue Jiang</a>, <a href="/search/eess?searchtype=author&amp;query=Ren%2C+Y">Yi Ren</a>, <a href="/search/eess?searchtype=author&amp;query=Ye%2C+Z">Zhenhui Ye</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+J">Jinglin Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+C">Chen Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Yang%2C+Q">Qian Yang</a>, <a href="/search/eess?searchtype=author&amp;query=Ji%2C+S">Shengpeng Ji</a>, <a href="/search/eess?searchtype=author&amp;query=Huang%2C+R">Rongjie Huang</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+C">Chunfeng Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Yin%2C+X">Xiang Yin</a>, <a href="/search/eess?searchtype=author&amp;query=Ma%2C+Z">Zejun Ma</a>, <a href="/search/eess?searchtype=author&amp;query=Zhao%2C+Z">Zhou Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2306.03509v1-abstract-short" style="display: inline;"> Scaling text-to-speech to a large and wild dataset has been proven to be highly effective in achieving timbre and speech style generalization, particularly in zero-shot TTS. However, previous works usually encode speech into latent using audio codec and use autoregressive language models or diffusion models to generate it, which ignores the intrinsic nature of speech and may lead to inferior or un&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.03509v1-abstract-full').style.display = 'inline'; document.getElementById('2306.03509v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2306.03509v1-abstract-full" style="display: none;"> Scaling text-to-speech to a large and wild dataset has been proven to be highly effective in achieving timbre and speech style generalization, particularly in zero-shot TTS. 
However, previous works usually encode speech into latent using audio codec and use autoregressive language models or diffusion models to generate it, which ignores the intrinsic nature of speech and may lead to inferior or uncontrollable results. We argue that speech can be decomposed into several attributes (e.g., content, timbre, prosody, and phase) and each of them should be modeled using a module with appropriate inductive biases. From this perspective, we carefully design a novel and large zero-shot TTS system called Mega-TTS, which is trained with large-scale wild data and models different attributes in different ways: 1) Instead of using latent encoded by audio codec as the intermediate feature, we still choose spectrogram as it separates the phase and other attributes very well. Phase can be appropriately constructed by the GAN-based vocoder and does not need to be modeled by the language model. 2) We model the timbre using global vectors since timbre is a global attribute that changes slowly over time. 3) We further use a VQGAN-based acoustic model to generate the spectrogram and a latent code language model to fit the distribution of prosody, since prosody changes quickly over time in a sentence, and language models can capture both local and long-range dependencies. We scale Mega-TTS to multi-domain datasets with 20K hours of speech and evaluate its performance on unseen speakers. Experimental results demonstrate that Mega-TTS surpasses state-of-the-art TTS systems on zero-shot TTS, speech editing, and cross-lingual TTS tasks, with superior naturalness, robustness, and speaker similarity due to the proper inductive bias of each module. Audio samples are available at https://mega-tts.github.io/demo-page. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.03509v1-abstract-full').style.display = 'none'; document.getElementById('2306.03509v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2023. 
arXiv:2306.03504 (https://arxiv.org/abs/2306.03504) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition); cs.SD (Sound); eess.AS (Audio and Speech Processing)
Title: Ada-TTA: Towards Adaptive High-Quality Text-to-Talking Avatar Synthesis
Authors: Zhenhui Ye, Ziyue Jiang, Yi Ren, Jinglin Liu, Chen Zhang, Xiang Yin, Zejun Ma, Zhou Zhao
Abstract: We are interested in a novel task, namely low-resource text-to-talking avatar. Given only a few-minute-long talking person video with the audio track as the training data and arbitrary texts as the driving input, we aim to synthesize high-quality talking portrait videos corresponding to the input text. This task has broad application prospects in the digital human industry but has not been technically achieved yet due to two challenges: (1) it is challenging to mimic the timbre from out-of-domain audio for a traditional multi-speaker text-to-speech system; (2) it is hard to render high-fidelity and lip-synchronized talking avatars with limited training data. In this paper, we introduce Adaptive Text-to-Talking Avatar (Ada-TTA), which (1) designs a generic zero-shot multi-speaker TTS model that well disentangles text content, timbre, and prosody; and (2) embraces recent advances in neural rendering to achieve realistic audio-driven talking face video generation. With these designs, our method overcomes the two challenges above and is able to generate identity-preserving speech and realistic talking-person video. Experiments demonstrate that our method can synthesize realistic, identity-preserving, and audio-visually synchronized talking avatar videos.
Submitted 2 August, 2023; v1 submitted 6 June, 2023; originally announced June 2023.
Comments: Accepted by ICML 2023 Workshop, 6 pages, 3 figures

arXiv:2305.19269 (https://arxiv.org/abs/2305.19269) [pdf, other]
Subjects: eess.AS (Audio and Speech Processing); cs.AI (Artificial Intelligence); cs.CL (Computation and Language); cs.SD (Sound)
Title: Make-A-Voice: Unified Voice Synthesis With Discrete Representation
Authors: Rongjie Huang, Chunlei Zhang, Yongqi Wang, Dongchao Yang, Luping Liu, Zhenhui Ye, Ziyue Jiang, Chao Weng, Zhou Zhao, Dong Yu
Abstract: Various applications of voice synthesis have been developed independently, despite the fact that they all generate "voice" as output. In addition, the majority of voice synthesis models currently rely on annotated audio data, but it is crucial to scale them to self-supervised datasets in order to effectively capture the wide range of acoustic variations present in the human voice, including speaker identity, emotion, and prosody. In this work, we propose Make-A-Voice, a unified framework for synthesizing and manipulating voice signals from discrete representations. Make-A-Voice leverages a "coarse-to-fine" approach to model the human voice, which involves three stages: 1) semantic stage: model the high-level transformation between linguistic content and self-supervised semantic tokens; 2) acoustic stage: introduce varying control signals as acoustic conditions for semantic-to-acoustic modeling; and 3) generation stage: synthesize high-fidelity waveforms from acoustic tokens. Make-A-Voice offers notable benefits as a unified voice synthesis framework: 1) Data scalability: the major backbone (i.e., the acoustic and generation stages) does not require any annotations, and thus the training data can be scaled up. 2) Controllability and conditioning flexibility: we investigate different conditioning mechanisms and effectively handle three voice synthesis applications, including text-to-speech (TTS), voice conversion (VC), and singing voice synthesis (SVS), by re-synthesizing the discrete voice representations with prompt guidance. Experimental results demonstrate that Make-A-Voice exhibits superior audio quality and style similarity compared with competitive baseline models. Audio samples are available at https://Make-A-Voice.github.io
Submitted 30 May, 2023; originally announced May 2023.
arXiv:2305.18474 (https://arxiv.org/abs/2305.18474) [pdf, other]
Subjects: cs.SD (Sound); cs.LG (Machine Learning); cs.MM (Multimedia); eess.AS (Audio and Speech Processing)
Title: Make-An-Audio 2: Temporal-Enhanced Text-to-Audio Generation
Authors: Jiawei Huang, Yi Ren, Rongjie Huang, Dongchao Yang, Zhenhui Ye, Chen Zhang, Jinglin Liu, Xiang Yin, Zejun Ma, Zhou Zhao
Abstract: Large diffusion models have been successful in text-to-audio (T2A) synthesis tasks, but they often suffer from common issues such as semantic misalignment and poor temporal consistency due to limited natural language understanding and data scarcity. Additionally, 2D spatial structures widely used in T2A works lead to unsatisfactory audio quality when generating variable-length audio samples since they do not adequately prioritize temporal information. To address these challenges, we propose Make-an-Audio 2, a latent diffusion-based T2A method that builds on the success of Make-an-Audio. Our approach includes several techniques to improve semantic alignment and temporal consistency: Firstly, we use pre-trained large language models (LLMs) to parse the text into structured <event & order> pairs for better temporal information capture. We also introduce another structured-text encoder to aid in learning semantic alignment during the diffusion denoising process. To improve the performance of variable length generation and enhance the temporal information extraction, we design a feed-forward Transformer-based diffusion denoiser. Finally, we use LLMs to augment and transform a large amount of audio-label data into audio-text datasets to alleviate the problem of scarcity of temporal data. Extensive experiments show that our method outperforms baseline models in both objective and subjective metrics, and achieves significant gains in temporal information understanding, semantic consistency, and sound quality.
Submitted 29 May, 2023; originally announced May 2023.

arXiv:2305.15403 (https://arxiv.org/abs/2305.15403) [pdf, other]
Subjects: cs.CL (Computation and Language); cs.SD (Sound); eess.AS (Audio and Speech Processing)
Title: AV-TranSpeech: Audio-Visual Robust Speech-to-Speech Translation
Authors: Rongjie Huang, Huadai Liu, Xize Cheng, Yi Ren, Linjun Li, Zhenhui Ye, Jinzheng He, Lichao Zhang, Jinglin Liu, Xiang Yin, Zhou Zhao
Abstract: Direct speech-to-speech translation (S2ST) aims to convert speech from one language into another, and has demonstrated significant progress to date. Despite the recent success, current S2ST models still suffer from distinct degradation in noisy environments and fail to translate visual speech (i.e., the movement of lips and teeth). In this work, we present AV-TranSpeech, the first audio-visual speech-to-speech (AV-S2ST) translation model without relying on intermediate text. AV-TranSpeech complements the audio stream with visual information to promote system robustness and opens up a host of practical applications: dictation or dubbing archival films. To mitigate the data scarcity with limited parallel AV-S2ST data, we 1) explore self-supervised pre-training with unlabeled audio-visual data to learn contextual representation, and 2) introduce cross-modal distillation with S2ST models trained on the audio-only corpus to further reduce the requirements of visual data. Experimental results on two language pairs demonstrate that AV-TranSpeech outperforms audio-only models under all settings regardless of the type of noise. With low-resource audio-visual data (10h, 30h), cross-modal distillation yields an improvement of 7.6 BLEU on average compared with baselines. Audio samples are available at https://AV-TranSpeech.github.io
Submitted 24 May, 2023; originally announced May 2023.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to ACL 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2305.13612">arXiv:2305.13612</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2305.13612">pdf</a>, <a href="https://arxiv.org/format/2305.13612">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> FluentSpeech: Stutter-Oriented Automatic Speech Editing with Context-Aware Diffusion Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Jiang%2C+Z">Ziyue Jiang</a>, <a href="/search/eess?searchtype=author&amp;query=Yang%2C+Q">Qian Yang</a>, <a href="/search/eess?searchtype=author&amp;query=Zuo%2C+J">Jialong Zuo</a>, <a href="/search/eess?searchtype=author&amp;query=Ye%2C+Z">Zhenhui Ye</a>, <a href="/search/eess?searchtype=author&amp;query=Huang%2C+R">Rongjie Huang</a>, <a href="/search/eess?searchtype=author&amp;query=Ren%2C+Y">Yi Ren</a>, <a href="/search/eess?searchtype=author&amp;query=Zhao%2C+Z">Zhou Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2305.13612v1-abstract-short" style="display: inline;"> Stutter removal is an essential scenario in the field of speech editing. However, when the speech recording contains stutters, the existing text-based speech editing approaches still suffer from: 1) the over-smoothing problem in the edited speech; 2) lack of robustness due to the noise introduced by stutter; 3) to remove the stutters, users are required to determine the edited region manually. To&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.13612v1-abstract-full').style.display = 'inline'; document.getElementById('2305.13612v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2305.13612v1-abstract-full" style="display: none;"> Stutter removal is an essential scenario in the field of speech editing. However, when the speech recording contains stutters, the existing text-based speech editing approaches still suffer from: 1) the over-smoothing problem in the edited speech; 2) lack of robustness due to the noise introduced by stutter; 3) to remove the stutters, users are required to determine the edited region manually. To tackle the challenges in stutter removal, we propose FluentSpeech, a stutter-oriented automatic speech editing model. Specifically, 1) we propose a context-aware diffusion model that iteratively refines the modified mel-spectrogram with the guidance of context features; 2) we introduce a stutter predictor module to inject the stutter information into the hidden sequence; 3) we also propose a stutter-oriented automatic speech editing (SASE) dataset that contains spontaneous speech recordings with time-aligned stutter labels to train the automatic stutter localization model. 
Experimental results on VCTK and LibriTTS datasets demonstrate that our model achieves state-of-the-art performance on speech editing. Further experiments on our SASE dataset show that FluentSpeech can effectively improve the fluency of stuttering speech in terms of objective and subjective metrics. Code and audio samples can be found at https://github.com/Zain-Jiang/Speech-Editing-Toolkit. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.13612v1-abstract-full').style.display = 'none'; document.getElementById('2305.13612v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ACL 2023 (Findings)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2305.12868">arXiv:2305.12868</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2305.12868">pdf</a>, <a href="https://arxiv.org/format/2305.12868">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> NAS-FM: Neural Architecture Search for Tunable and Interpretable Sound Synthesis based on Frequency Modulation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Ye%2C+Z">Zhen Ye</a>, <a href="/search/eess?searchtype=author&amp;query=Xue%2C+W">Wei Xue</a>, <a href="/search/eess?searchtype=author&amp;query=Tan%2C+X">Xu Tan</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+Q">Qifeng Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Guo%2C+Y">Yike Guo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2305.12868v1-abstract-short" style="display: inline;"> Developing digital sound synthesizers is crucial to the music industry as it provides a low-cost way to produce high-quality sounds with rich timbres. Existing traditional synthesizers often require substantial expertise to determine the overall framework of a synthesizer and the parameters of submodules. 
Since expert knowledge is hard to acquire, it hinders the flexibility to quickly design and t&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.12868v1-abstract-full').style.display = 'inline'; document.getElementById('2305.12868v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2305.12868v1-abstract-full" style="display: none;"> Developing digital sound synthesizers is crucial to the music industry as it provides a low-cost way to produce high-quality sounds with rich timbres. Existing traditional synthesizers often require substantial expertise to determine the overall framework of a synthesizer and the parameters of submodules. Since expert knowledge is hard to acquire, it hinders the flexibility to quickly design and tune digital synthesizers for diverse sounds. In this paper, we propose ``NAS-FM&#39;&#39;, which adopts neural architecture search (NAS) to build a differentiable frequency modulation (FM) synthesizer. Tunable synthesizers with interpretable controls can be developed automatically from sounds without any prior expert knowledge and manual operating costs. In detail, we train a supernet with a specifically designed search space, including predicting the envelopes of carriers and modulators with different frequency ratios. An evolutionary search algorithm with adaptive oscillator size is then developed to find the optimal relationship between oscillators and the frequency ratio of FM. Extensive experiments on recordings of different instrument sounds show that our algorithm can build a synthesizer fully automatically, achieving better results than handcrafted synthesizers. Audio samples are available at https://nas-fm.github.io/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.12868v1-abstract-full').style.display = 'none'; document.getElementById('2305.12868v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2023. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> IJCAI 2023 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2305.10763">arXiv:2305.10763</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2305.10763">pdf</a>, <a href="https://arxiv.org/format/2305.10763">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> CLAPSpeech: Learning Prosody from Text Context with Contrastive Language-Audio Pre-training </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Ye%2C+Z">Zhenhui Ye</a>, <a href="/search/eess?searchtype=author&amp;query=Huang%2C+R">Rongjie Huang</a>, <a href="/search/eess?searchtype=author&amp;query=Ren%2C+Y">Yi Ren</a>, <a href="/search/eess?searchtype=author&amp;query=Jiang%2C+Z">Ziyue Jiang</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+J">Jinglin Liu</a>, <a href="/search/eess?searchtype=author&amp;query=He%2C+J">Jinzheng He</a>, <a href="/search/eess?searchtype=author&amp;query=Yin%2C+X">Xiang Yin</a>, <a href="/search/eess?searchtype=author&amp;query=Zhao%2C+Z">Zhou Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2305.10763v1-abstract-short" style="display: inline;"> Improving text representation has attracted much attention to achieve expressive text-to-speech (TTS). However, existing works only implicitly learn the prosody with masked token reconstruction tasks, which leads to low training efficiency and difficulty in prosody modeling. We propose CLAPSpeech, a cross-modal contrastive pre-training framework that explicitly learns the prosody variance of the s&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.10763v1-abstract-full').style.display = 'inline'; document.getElementById('2305.10763v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2305.10763v1-abstract-full" style="display: none;"> Improving text representation has attracted much attention to achieve expressive text-to-speech (TTS). However, existing works only implicitly learn the prosody with masked token reconstruction tasks, which leads to low training efficiency and difficulty in prosody modeling. We propose CLAPSpeech, a cross-modal contrastive pre-training framework that explicitly learns the prosody variance of the same text token under different contexts. Specifically, 1) We encourage the model to connect the text context with its corresponding prosody pattern in the joint multi-modal space with the elaborate design of the encoder inputs and contrastive loss; 2) We introduce a multi-scale pre-training pipeline to capture prosody patterns in multiple levels. We show how to incorporate CLAPSpeech into existing TTS models for better prosody. Experiments on three datasets not only show that CLAPSpeech could improve the prosody prediction for existing TTS methods, but also demonstrate its generalization ability to adapt to multiple languages and multi-speaker TTS. 
arXiv:2305.10763 (https://arxiv.org/abs/2305.10763) [pdf, other]
Subjects: cs.SD (Sound); eess.AS (Audio and Speech Processing)
Title: CLAPSpeech: Learning Prosody from Text Context with Contrastive Language-Audio Pre-training
Authors: Zhenhui Ye, Rongjie Huang, Yi Ren, Ziyue Jiang, Jinglin Liu, Jinzheng He, Xiang Yin, Zhou Zhao
Abstract: Improving text representation has attracted much attention to achieve expressive text-to-speech (TTS). However, existing works only implicitly learn the prosody with masked token reconstruction tasks, which leads to low training efficiency and difficulty in prosody modeling. We propose CLAPSpeech, a cross-modal contrastive pre-training framework that explicitly learns the prosody variance of the same text token under different contexts. Specifically, 1) we encourage the model to connect the text context with its corresponding prosody pattern in the joint multi-modal space with the elaborate design of the encoder inputs and contrastive loss; 2) we introduce a multi-scale pre-training pipeline to capture prosody patterns at multiple levels. We show how to incorporate CLAPSpeech into existing TTS models for better prosody. Experiments on three datasets not only show that CLAPSpeech could improve the prosody prediction for existing TTS methods, but also demonstrate its generalization ability to adapt to multiple languages and multi-speaker TTS. We also deeply analyze the principle behind the performance of CLAPSpeech. Ablation studies demonstrate the necessity of each component in our method. Source code and audio samples are available at https://clapspeech.github.io.
Submitted 18 May, 2023; originally announced May 2023.
Comments: Accepted by ACL 2023 (Main Conference)
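
Illustrative sketch (not from the paper): the contrastive objective described here is, in spirit, the symmetric CLIP/CLAP-style InfoNCE loss over paired text-context and prosody (speech) embeddings. The encoders, batch construction, and temperature below are assumptions for illustration, not CLAPSpeech's actual multi-scale pipeline.

```python
import torch
import torch.nn.functional as F

# Generic symmetric InfoNCE loss over paired text-context and speech (prosody)
# embeddings; matched pairs sit on the diagonal of the similarity matrix.
def contrastive_loss(text_emb, speech_emb, temperature=0.07):
    text_emb = F.normalize(text_emb, dim=-1)
    speech_emb = F.normalize(speech_emb, dim=-1)
    logits = text_emb @ speech_emb.t() / temperature    # (B, B) similarities
    targets = torch.arange(text_emb.size(0))
    return 0.5 * (F.cross_entropy(logits, targets) +
                  F.cross_entropy(logits.t(), targets))

loss = contrastive_loss(torch.randn(8, 256), torch.randn(8, 256))  # toy batch of 8 pairs
```
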
arXiv:2305.10686 (https://arxiv.org/abs/2305.10686) [pdf, other]
Subjects: cs.SD (Sound); cs.CL (Computation and Language); eess.AS (Audio and Speech Processing)
Title: RMSSinger: Realistic-Music-Score based Singing Voice Synthesis
Authors: Jinzheng He, Jinglin Liu, Zhenhui Ye, Rongjie Huang, Chenye Cui, Huadai Liu, Zhou Zhao
Abstract: We are interested in a challenging task, Realistic-Music-Score based Singing Voice Synthesis (RMS-SVS). RMS-SVS aims to generate high-quality singing voices given realistic music scores with different note types (grace, slur, rest, etc.). Though significant progress has been achieved, recent singing voice synthesis (SVS) methods are limited to fine-grained music scores, which require a complicated data collection pipeline with time-consuming manual annotation to align music notes with phonemes. Furthermore, this manual annotation destroys the regularity of note durations in music scores, making fine-grained music scores inconvenient for composing. To tackle these challenges, we propose RMSSinger, the first RMS-SVS method, which takes realistic music scores as input, eliminating most of the tedious manual annotation and avoiding the aforementioned inconvenience. Since music scores are based on words rather than phonemes, in RMSSinger we introduce word-level modeling to avoid the time-consuming phoneme duration annotation and the complicated phoneme-level mel-note alignment. Furthermore, we propose the first diffusion-based pitch modeling method, which ameliorates the naturalness of existing pitch-modeling methods. To achieve these goals, we collect a new dataset containing realistic music scores and singing voices performed by professional singers according to these scores. Extensive experiments on the dataset demonstrate the effectiveness of our methods. Audio samples are available at https://rmssinger.github.io/.
Submitted 17 May, 2023; originally announced May 2023.
Comments: Accepted by Findings of ACL 2023

arXiv:2305.06908 (https://arxiv.org/abs/2305.06908) [pdf, other]
Subjects: cs.SD (Sound); cs.AI (Artificial Intelligence); cs.CL (Computation and Language); cs.LG (Machine Learning); cs.MM (Multimedia); eess.AS (Audio and Speech Processing)
Title: CoMoSpeech: One-Step Speech and Singing Voice Synthesis via Consistency Model
Authors: Zhen Ye, Wei Xue, Xu Tan, Jie Chen, Qifeng Liu, Yike Guo
Abstract: Denoising diffusion probabilistic models (DDPMs) have shown promising performance for speech synthesis. However, a large number of iterative steps are required to achieve high sample quality, which restricts the inference speed. Maintaining sample quality while increasing sampling speed has become a challenging task. In this paper, we propose a "Co"nsistency "Mo"del-based "Speech" synthesis method, CoMoSpeech, which achieves speech synthesis through a single diffusion sampling step while maintaining high audio quality. A consistency constraint is applied to distill a consistency model from a well-designed diffusion-based teacher model, which ultimately yields superior performance in the distilled CoMoSpeech. Our experiments show that, by generating audio recordings in a single sampling step, CoMoSpeech achieves an inference speed more than 150 times faster than real-time on a single NVIDIA A100 GPU, which is comparable to FastSpeech2, making diffusion-sampling-based speech synthesis truly practical. Meanwhile, objective and subjective evaluations on text-to-speech and singing voice synthesis show that the proposed teacher models yield the best audio quality, and the one-step-sampling-based CoMoSpeech achieves the best inference speed with better or comparable audio quality to other conventional multi-step diffusion model baselines. Audio samples are available at https://comospeech.github.io/.
Submitted 29 October, 2023; v1 submitted 11 May, 2023; originally announced May 2023.
Comments: Accepted to ACM MM 2023
arXiv:2304.12995 (https://arxiv.org/abs/2304.12995) [pdf, other]
Subjects: cs.CL (Computation and Language); cs.AI (Artificial Intelligence); cs.SD (Sound); eess.AS (Audio and Speech Processing)
Title: AudioGPT: Understanding and Generating Speech, Music, Sound, and Talking Head
Authors: Rongjie Huang, Mingze Li, Dongchao Yang, Jiatong Shi, Xuankai Chang, Zhenhui Ye, Yuning Wu, Zhiqing Hong, Jiawei Huang, Jinglin Liu, Yi Ren, Zhou Zhao, Shinji Watanabe
Abstract: Large language models (LLMs) have exhibited remarkable capabilities across a variety of domains and tasks, challenging our understanding of learning and cognition. Despite the recent success, current LLMs are not capable of processing complex audio information or conducting spoken conversations (like Siri or Alexa). In this work, we propose a multi-modal AI system named AudioGPT, which complements LLMs (i.e., ChatGPT) with 1) foundation models to process complex audio information and solve numerous understanding and generation tasks; and 2) an input/output interface (ASR, TTS) to support spoken dialogue. With an increasing demand to evaluate multi-modal LLMs on human intention understanding and cooperation with foundation models, we outline the principles and processes and test AudioGPT in terms of consistency, capability, and robustness. Experimental results demonstrate the capabilities of AudioGPT in solving AI tasks with speech, music, sound, and talking head understanding and generation in multi-round dialogues, which empowers humans to create rich and diverse audio content with unprecedented ease. Our system is publicly available at https://github.com/AIGC-Audio/AudioGPT.
Submitted 25 April, 2023; originally announced April 2023.

arXiv:2302.00240 (https://arxiv.org/abs/2302.00240) [pdf, other]
Subjects: eess.SY (Systems and Control); cs.CY (Computers and Society); math.CO (Combinatorics); math.OC (Optimization and Control)
Title: Toward Efficient Transportation Electrification of Heavy-Duty Trucks: Joint Scheduling of Truck Routing and Charging
Authors: Mikhail A. Bragin, Zuzhao Ye, Nanpeng Yu
Abstract: The timely transportation of goods to customers is an essential component of economic activities. However, heavy-duty diesel trucks used for goods delivery significantly contribute to greenhouse gas (GHG) emissions within many large metropolitan areas, including Los Angeles, New York, and San Francisco. To reduce GHG emissions by facilitating freight electrification, this paper proposes Joint Routing and Charging scheduling for electric trucks. The objective of the associated optimization problem is to minimize the cost of transportation, charging, and tardiness. The large number of possible combinations of road segments, together with the large number of combinations of charging decisions and charging durations, leads to a combinatorial explosion in the possible decisions electric trucks can make. The resulting mixed-integer linear programming problem is thus extremely challenging because of the combinatorial complexity, even in the deterministic case. Therefore, a Surrogate Level-Based Lagrangian Relaxation (SLBLR) method is employed to decompose the overall problem into significantly less complex truck subproblems. In the coordination aspect, each truck subproblem is solved independently of the other subproblems based on the values of the Lagrangian multipliers. In addition to serving as a means of guiding and coordinating trucks, the multipliers can also serve as a basis for transparent and explanatory decision-making by the trucks. Testing results demonstrate that even small instances cannot be solved by the off-the-shelf solver CPLEX after several days of solving. The SLBLR method, on the other hand, can obtain near-optimal solutions within a few minutes for small cases, and within 30 minutes for large ones. Furthermore, it is demonstrated that as battery capacity increases, the total cost decreases significantly; moreover, as the charging power increases, the number of trucks required decreases as well.
Submitted 13 November, 2023; v1 submitted 31 January, 2023; originally announced February 2023.
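
Illustrative sketch (not from the paper): in a Lagrangian decomposition of this kind, coupling constraints (for example, chargers shared by several trucks) are priced by multipliers that rise where usage exceeds capacity and fall where there is slack, and each truck re-solves its own routing/charging subproblem against those prices. The plain projected-subgradient update below is a simplified stand-in for the surrogate level-based update that SLBLR actually uses.

```python
# Simplified multiplier update for relaxed coupling constraints of the form
# sum_k usage_k <= capacity (e.g., trucks competing for charging slots).
# SLBLR's surrogate level-based step-size rule is more involved; a plain
# projected subgradient step is shown here purely for illustration.
def update_multipliers(multipliers, usage, capacity, step=0.1):
    return [max(lam + step * (u - c), 0.0)
            for lam, u, c in zip(multipliers, usage, capacity)]

# each truck would then re-solve its subproblem with these "prices" on shared resources
lam = update_multipliers([0.0, 0.5], usage=[3, 1], capacity=[2, 2])
```
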
arXiv:2301.12661 (https://arxiv.org/abs/2301.12661) [pdf, other]
Subjects: cs.SD (Sound); cs.LG (Machine Learning); cs.MM (Multimedia); eess.AS (Audio and Speech Processing)
Title: Make-An-Audio: Text-To-Audio Generation with Prompt-Enhanced Diffusion Models
Authors: Rongjie Huang, Jiawei Huang, Dongchao Yang, Yi Ren, Luping Liu, Mingze Li, Zhenhui Ye, Jinglin Liu, Xiang Yin, Zhou Zhao
Abstract: Large-scale multimodal generative modeling has created milestones in text-to-image and text-to-video generation. Its application to audio still lags behind for two main reasons: the lack of large-scale datasets with high-quality text-audio pairs, and the complexity of modeling long continuous audio data. In this work, we propose Make-An-Audio with a prompt-enhanced diffusion model that addresses these gaps by 1) introducing pseudo prompt enhancement with a distill-then-reprogram approach, which alleviates data scarcity with orders of magnitude more concept compositions by using language-free audios; and 2) leveraging a spectrogram autoencoder to predict the self-supervised audio representation instead of waveforms. Together with robust contrastive language-audio pretraining (CLAP) representations, Make-An-Audio achieves state-of-the-art results in both objective and subjective benchmark evaluations. Moreover, we present its controllability and generalization for X-to-Audio with "No Modality Left Behind", for the first time unlocking the ability to generate high-definition, high-fidelity audio given a user-defined modality input. Audio samples are available at https://Text-to-Audio.github.io
Submitted 29 January, 2023; originally announced January 2023.
Comments: Audio samples are available at https://Text-to-Audio.github.io

arXiv:2301.00776 (https://arxiv.org/abs/2301.00776) [pdf, other]
Subjects: eess.SP (Signal Processing); cs.AI (Artificial Intelligence); cs.LG (Machine Learning)
Title: Physics-Informed Neural Networks for Prognostics and Health Management of Lithium-Ion Batteries
Authors: Pengfei Wen, Zhi-Sheng Ye, Yong Li, Shaowei Chen, Pu Xie, Shuai Zhao
Abstract: For Prognostics and Health Management (PHM) of Lithium-ion (Li-ion) batteries, many models have been established to characterize their degradation process. The existing empirical or physical models can reveal important information regarding the degradation dynamics. However, there are no general and flexible methods to fuse the information represented by those models. The Physics-Informed Neural Network (PINN) is an efficient tool to fuse empirical or physical dynamic models with data-driven models. To take full advantage of various information sources, we propose a model fusion scheme based on the PINN. It is implemented by developing a semi-empirical, semi-physical Partial Differential Equation (PDE) to model the degradation dynamics of Li-ion batteries. When there is little prior knowledge about the dynamics, we leverage the data-driven Deep Hidden Physics Model (DeepHPM) to discover the underlying governing dynamic models. The uncovered dynamics information is then fused with that mined by the surrogate neural network in the PINN framework. Moreover, an uncertainty-based adaptive weighting method is employed to balance the multiple learning tasks when training the PINN. The proposed methods are verified on a public dataset of Li-ion Phosphate (LFP)/graphite batteries.
Submitted 11 September, 2023; v1 submitted 2 January, 2023; originally announced January 2023.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">14 pages, 10 figures</span> </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Ye%2C+Z&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Ye%2C+Z&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Ye%2C+Z&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> </ul> </nav> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 
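This second abstract combines three ingredients that are straightforward to prototype together: a surrogate network for the degradation trajectory, a DeepHPM-style network that learns the unknown governing dynamics, and uncertainty-based adaptive weighting of the data-fit and physics-residual losses. The sketch below is a minimal, hypothetical PyTorch illustration of that pattern on synthetic capacity-fade data; the network sizes, the toy dynamics du/dt = N(t, u), and the data are assumptions for illustration, not the paper's model or dataset.

```python
# Hypothetical sketch: PINN-style fusion of a surrogate u(t) for battery state of health
# with a learned hidden-physics term N(t, u), trained with uncertainty-weighted losses.
import torch
import torch.nn as nn

def mlp(sizes):
    layers = []
    for i in range(len(sizes) - 2):
        layers += [nn.Linear(sizes[i], sizes[i + 1]), nn.Tanh()]
    layers.append(nn.Linear(sizes[-2], sizes[-1]))
    return nn.Sequential(*layers)

surrogate = mlp([1, 32, 32, 1])          # u(t): state of health vs. normalized cycle
hidden_physics = mlp([2, 32, 1])         # N(t, u): learned right-hand side of du/dt
log_vars = nn.Parameter(torch.zeros(2))  # one learned log-variance per task

params = list(surrogate.parameters()) + list(hidden_physics.parameters()) + [log_vars]
opt = torch.optim.Adam(params, lr=1e-3)

# Toy "measurements": capacity fading roughly exponentially with cycles.
t_data = torch.linspace(0, 1, 50).unsqueeze(-1)
u_data = torch.exp(-0.5 * t_data) + 0.01 * torch.randn_like(t_data)

# Collocation points where the learned dynamics must hold.
t_col = torch.rand(200, 1, requires_grad=True)

for step in range(2000):
    opt.zero_grad()

    # Task 1: fit the observed degradation data.
    loss_data = nn.functional.mse_loss(surrogate(t_data), u_data)

    # Task 2: physics residual du/dt - N(t, u) = 0 at collocation points.
    u = surrogate(t_col)
    du_dt = torch.autograd.grad(u.sum(), t_col, create_graph=True)[0]
    residual = du_dt - hidden_physics(torch.cat([t_col, u], dim=-1))
    loss_phys = residual.pow(2).mean()

    # Uncertainty-based adaptive weighting: sum_i exp(-s_i) * L_i + s_i.
    losses = torch.stack([loss_data, loss_phys])
    loss = (torch.exp(-log_vars) * losses + log_vars).sum()

    loss.backward()
    opt.step()

print(float(loss_data), float(loss_phys))
```

The learned log-variances act as per-task weights: a task whose residuals stay noisy is automatically down-weighted, which is one common way to balance competing PINN losses.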
