CINXE.COM

Search | arXiv e-print repository

<!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"/> <meta name="viewport" content="width=device-width, initial-scale=1"/> <!-- new favicon config and versions by realfavicongenerator.net --> <link rel="apple-touch-icon" sizes="180x180" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-16x16.png"> <link rel="manifest" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/site.webmanifest"> <link rel="mask-icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/safari-pinned-tab.svg" color="#b31b1b"> <link rel="shortcut icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon.ico"> <meta name="msapplication-TileColor" content="#b31b1b"> <meta name="msapplication-config" content="images/icons/browserconfig.xml"> <meta name="theme-color" content="#b31b1b"> <!-- end favicon config --> <title>Search | arXiv e-print repository</title> <script defer src="https://static.arxiv.org/static/base/1.0.0a5/fontawesome-free-5.11.2-web/js/all.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/base/1.0.0a5/css/arxivstyle.css" /> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ messageStyle: "none", extensions: ["tex2jax.js"], jax: ["input/TeX", "output/HTML-CSS"], tex2jax: { inlineMath: [ ['$','$'], ["\\(","\\)"] ], displayMath: [ ['$$','$$'], ["\\[","\\]"] ], processEscapes: true, ignoreClass: '.*', processClass: 'mathjax.*' }, TeX: { extensions: ["AMSmath.js", "AMSsymbols.js", "noErrors.js"], noErrors: { inlineDelimiters: ["$","$"], multiLine: false, style: { "font-size": "normal", "border": "" } } }, "HTML-CSS": { availableFonts: ["TeX"] } }); </script> <script src='//static.arxiv.org/MathJax-2.7.3/MathJax.js'></script> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/notification.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/bulma-tooltip.min.css" /> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/search.css" /> <script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g=" crossorigin="anonymous"></script> <script src="https://static.arxiv.org/static/search/0.5.6/js/fieldset.js"></script> <style> radio#cf-customfield_11400 { display: none; } </style> </head> <body> <header><a href="#main-container" class="is-sr-only">Skip to main content</a> <!-- contains Cornell logo and sponsor statement --> <div class="attribution level is-marginless" role="banner"> <div class="level-left"> <a class="level-item" href="https://cornell.edu/"><img src="https://static.arxiv.org/static/base/1.0.0a5/images/cornell-reduced-white-SMALL.svg" alt="Cornell University" width="200" aria-label="logo" /></a> </div> <div class="level-right is-marginless"><p class="sponsors level-item is-marginless"><span id="support-ack-url">We gratefully acknowledge support from<br /> the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors. <a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1&ndash;50 of 51 results for author: <span class="mathjax">Fu, R</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> <div class="content"> <form method="GET" action="/search/eess" aria-role="search"> Searching in archive <strong>eess</strong>. <a href="/search/?searchtype=author&amp;query=Fu%2C+R">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." type="text" value="Fu, R"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Fu%2C+R&amp;terms-0-field=author&amp;size=50&amp;order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Fu, R"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. </div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Fu%2C+R&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Fu%2C+R&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Fu%2C+R&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.11909">arXiv:2409.11909</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.11909">pdf</a>, <a href="https://arxiv.org/format/2409.11909">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Mixture of Experts Fusion for Fake Audio Detection Using Frozen wav2vec 2.0 </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Wang%2C+Z">Zhiyong Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Fu%2C+R">Ruibo Fu</a>, <a href="/search/eess?searchtype=author&amp;query=Wen%2C+Z">Zhengqi Wen</a>, <a href="/search/eess?searchtype=author&amp;query=Tao%2C+J">Jianhua Tao</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+X">Xiaopeng Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Xie%2C+Y">Yuankun Xie</a>, <a href="/search/eess?searchtype=author&amp;query=Qi%2C+X">Xin Qi</a>, <a href="/search/eess?searchtype=author&amp;query=Shi%2C+S">Shuchen Shi</a>, <a href="/search/eess?searchtype=author&amp;query=Lu%2C+Y">Yi Lu</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+Y">Yukun Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+C">Chenxing Li</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+X">Xuefei Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+G">Guanjun Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.11909v1-abstract-short" style="display: inline;"> Speech synthesis technology has posed a serious threat to speaker verification systems. Currently, the most effective fake audio detection methods utilize pretrained models, and integrating features from various layers of pretrained model further enhances detection performance. However, most of the previously proposed fusion methods require fine-tuning the pretrained models, resulting in exces&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.11909v1-abstract-full').style.display = 'inline'; document.getElementById('2409.11909v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.11909v1-abstract-full" style="display: none;"> Speech synthesis technology has posed a serious threat to speaker verification systems. Currently, the most effective fake audio detection methods utilize pretrained models, and integrating features from various layers of pretrained model further enhances detection performance. However, most of the previously proposed fusion methods require fine-tuning the pretrained models, resulting in excessively long training times and hindering model iteration when facing new speech synthesis technology. To address this issue, this paper proposes a feature fusion method based on the Mixture of Experts, which extracts and integrates features relevant to fake audio detection from layer features, guided by a gating network based on the last layer feature, while freezing the pretrained model. Experiments conducted on the ASVspoof2019 and ASVspoof2021 datasets demonstrate that the proposed method achieves competitive performance compared to those requiring fine-tuning. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.11909v1-abstract-full').style.display = 'none'; document.getElementById('2409.11909v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">submitted to ICASSP2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.11835">arXiv:2409.11835</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.11835">pdf</a>, <a href="https://arxiv.org/format/2409.11835">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> DPI-TTS: Directional Patch Interaction for Fast-Converging and Style Temporal Modeling in Text-to-Speech </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Qi%2C+X">Xin Qi</a>, <a href="/search/eess?searchtype=author&amp;query=Fu%2C+R">Ruibo Fu</a>, <a href="/search/eess?searchtype=author&amp;query=Wen%2C+Z">Zhengqi Wen</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+T">Tao Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Qiang%2C+C">Chunyu Qiang</a>, <a href="/search/eess?searchtype=author&amp;query=Tao%2C+J">Jianhua Tao</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+C">Chenxing Li</a>, <a href="/search/eess?searchtype=author&amp;query=Lu%2C+Y">Yi Lu</a>, <a href="/search/eess?searchtype=author&amp;query=Shi%2C+S">Shuchen Shi</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+Z">Zhiyong Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+X">Xiaopeng Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Xie%2C+Y">Yuankun Xie</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+Y">Yukun Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+X">Xuefei Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+G">Guanjun Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.11835v1-abstract-short" style="display: inline;"> In recent years, speech diffusion models have advanced rapidly. Alongside the widely used U-Net architecture, transformer-based models such as the Diffusion Transformer (DiT) have also gained attention. However, current DiT speech models treat Mel spectrograms as general images, which overlooks the specific acoustic properties of speech. To address these limitations, we propose a method called Dir&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.11835v1-abstract-full').style.display = 'inline'; document.getElementById('2409.11835v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.11835v1-abstract-full" style="display: none;"> In recent years, speech diffusion models have advanced rapidly. Alongside the widely used U-Net architecture, transformer-based models such as the Diffusion Transformer (DiT) have also gained attention. However, current DiT speech models treat Mel spectrograms as general images, which overlooks the specific acoustic properties of speech. To address these limitations, we propose a method called Directional Patch Interaction for Text-to-Speech (DPI-TTS), which builds on DiT and achieves fast training without compromising accuracy. Notably, DPI-TTS employs a low-to-high frequency, frame-by-frame progressive inference approach that aligns more closely with acoustic properties, enhancing the naturalness of the generated speech. Additionally, we introduce a fine-grained style temporal modeling method that further improves speaker style similarity. Experimental results demonstrate that our method increases the training speed by nearly 2 times and significantly outperforms the baseline models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.11835v1-abstract-full').style.display = 'none'; document.getElementById('2409.11835v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Submitted to ICASSP2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.09381">arXiv:2409.09381</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.09381">pdf</a>, <a href="https://arxiv.org/format/2409.09381">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Text Prompt is Not Enough: Sound Event Enhanced Prompt Adapter for Target Style Audio Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Xiong%2C+C">Chenxu Xiong</a>, <a href="/search/eess?searchtype=author&amp;query=Fu%2C+R">Ruibo Fu</a>, <a href="/search/eess?searchtype=author&amp;query=Shi%2C+S">Shuchen Shi</a>, <a href="/search/eess?searchtype=author&amp;query=Wen%2C+Z">Zhengqi Wen</a>, <a href="/search/eess?searchtype=author&amp;query=Tao%2C+J">Jianhua Tao</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+T">Tao Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+C">Chenxing Li</a>, <a href="/search/eess?searchtype=author&amp;query=Qiang%2C+C">Chunyu Qiang</a>, <a href="/search/eess?searchtype=author&amp;query=Xie%2C+Y">Yuankun Xie</a>, <a href="/search/eess?searchtype=author&amp;query=Qi%2C+X">Xin Qi</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+G">Guanjun Li</a>, <a href="/search/eess?searchtype=author&amp;query=Yang%2C+Z">Zizheng Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.09381v1-abstract-short" style="display: inline;"> Current mainstream audio generation methods primarily rely on simple text prompts, often failing to capture the nuanced details necessary for multi-style audio generation. To address this limitation, the Sound Event Enhanced Prompt Adapter is proposed. Unlike traditional static global style transfer, this method extracts style embedding through cross-attention between text and reference audio for&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.09381v1-abstract-full').style.display = 'inline'; document.getElementById('2409.09381v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.09381v1-abstract-full" style="display: none;"> Current mainstream audio generation methods primarily rely on simple text prompts, often failing to capture the nuanced details necessary for multi-style audio generation. To address this limitation, the Sound Event Enhanced Prompt Adapter is proposed. Unlike traditional static global style transfer, this method extracts style embedding through cross-attention between text and reference audio for adaptive style control. Adaptive layer normalization is then utilized to enhance the model&#39;s capacity to express multiple styles. Additionally, the Sound Event Reference Style Transfer Dataset (SERST) is introduced for the proposed target style audio generation task, enabling dual-prompt audio generation using both text and audio references. Experimental results demonstrate the robustness of the model, achieving state-of-the-art Fr茅chet Distance of 26.94 and KL Divergence of 1.82, surpassing Tango, AudioLDM, and AudioGen. Furthermore, the generated audio shows high similarity to its corresponding audio reference. The demo, code, and dataset are publicly available. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.09381v1-abstract-full').style.display = 'none'; document.getElementById('2409.09381v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages, 2 figures, submitted to ICASSP 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.10853">arXiv:2408.10853</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.10853">pdf</a>, <a href="https://arxiv.org/format/2408.10853">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Does Current Deepfake Audio Detection Model Effectively Detect ALM-based Deepfake Audio? </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Xie%2C+Y">Yuankun Xie</a>, <a href="/search/eess?searchtype=author&amp;query=Xiong%2C+C">Chenxu Xiong</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+X">Xiaopeng Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+Z">Zhiyong Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Lu%2C+Y">Yi Lu</a>, <a href="/search/eess?searchtype=author&amp;query=Qi%2C+X">Xin Qi</a>, <a href="/search/eess?searchtype=author&amp;query=Fu%2C+R">Ruibo Fu</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+Y">Yukun Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Wen%2C+Z">Zhengqi Wen</a>, <a href="/search/eess?searchtype=author&amp;query=Tao%2C+J">Jianhua Tao</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+G">Guanjun Li</a>, <a href="/search/eess?searchtype=author&amp;query=Ye%2C+L">Long Ye</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.10853v1-abstract-short" style="display: inline;"> Currently, Audio Language Models (ALMs) are rapidly advancing due to the developments in large language models and audio neural codecs. These ALMs have significantly lowered the barrier to creating deepfake audio, generating highly realistic and diverse types of deepfake audio, which pose severe threats to society. Consequently, effective audio deepfake detection technologies to detect ALM-based a&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.10853v1-abstract-full').style.display = 'inline'; document.getElementById('2408.10853v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.10853v1-abstract-full" style="display: none;"> Currently, Audio Language Models (ALMs) are rapidly advancing due to the developments in large language models and audio neural codecs. These ALMs have significantly lowered the barrier to creating deepfake audio, generating highly realistic and diverse types of deepfake audio, which pose severe threats to society. Consequently, effective audio deepfake detection technologies to detect ALM-based audio have become increasingly critical. This paper investigate the effectiveness of current countermeasure (CM) against ALM-based audio. Specifically, we collect 12 types of the latest ALM-based deepfake audio and utilizing the latest CMs to evaluate. Our findings reveal that the latest codec-trained CM can effectively detect ALM-based audio, achieving 0% equal error rate under most ALM test conditions, which exceeded our expectations. This indicates promising directions for future research in ALM-based deepfake audio detection. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.10853v1-abstract-full').style.display = 'none'; document.getElementById('2408.10853v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.10852">arXiv:2408.10852</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.10852">pdf</a>, <a href="https://arxiv.org/format/2408.10852">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> EELE: Exploring Efficient and Extensible LoRA Integration in Emotional Text-to-Speech </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Qi%2C+X">Xin Qi</a>, <a href="/search/eess?searchtype=author&amp;query=Fu%2C+R">Ruibo Fu</a>, <a href="/search/eess?searchtype=author&amp;query=Wen%2C+Z">Zhengqi Wen</a>, <a href="/search/eess?searchtype=author&amp;query=Tao%2C+J">Jianhua Tao</a>, <a href="/search/eess?searchtype=author&amp;query=Shi%2C+S">Shuchen Shi</a>, <a href="/search/eess?searchtype=author&amp;query=Lu%2C+Y">Yi Lu</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+Z">Zhiyong Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+X">Xiaopeng Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Xie%2C+Y">Yuankun Xie</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+Y">Yukun Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+G">Guanjun Li</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+X">Xuefei Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+Y">Yongwei Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.10852v1-abstract-short" style="display: inline;"> In the current era of Artificial Intelligence Generated Content (AIGC), a Low-Rank Adaptation (LoRA) method has emerged. It uses a plugin-based approach to learn new knowledge with lower parameter quantities and computational costs, and it can be plugged in and out based on the specific sub-tasks, offering high flexibility. However, the current application schemes primarily incorporate LoRA into t&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.10852v1-abstract-full').style.display = 'inline'; document.getElementById('2408.10852v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.10852v1-abstract-full" style="display: none;"> In the current era of Artificial Intelligence Generated Content (AIGC), a Low-Rank Adaptation (LoRA) method has emerged. It uses a plugin-based approach to learn new knowledge with lower parameter quantities and computational costs, and it can be plugged in and out based on the specific sub-tasks, offering high flexibility. However, the current application schemes primarily incorporate LoRA into the pre-introduced conditional parts of the speech models. This fixes the position of LoRA, limiting the flexibility and scalability of its application. Therefore, we propose the Exploring Efficient and Extensible LoRA Integration in Emotional Text-to-Speech (EELE) method. Starting from a general neutral speech model, we do not pre-introduce emotional information but instead use the LoRA plugin to design a flexible adaptive scheme that endows the model with emotional generation capabilities. Specifically, we initially train the model using only neutral speech data. After training is complete, we insert LoRA into different modules and fine-tune the model with emotional speech data to find the optimal insertion scheme. Through experiments, we compare and test the effects of inserting LoRA at different positions within the model and assess LoRA&#39;s ability to learn various emotions, effectively proving the validity of our method. Additionally, we explore the impact of the rank size of LoRA and the difference compared to directly fine-tuning the entire model. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.10852v1-abstract-full').style.display = 'none'; document.getElementById('2408.10852v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.10849">arXiv:2408.10849</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.10849">pdf</a>, <a href="https://arxiv.org/format/2408.10849">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> A Noval Feature via Color Quantisation for Fake Audio Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Wang%2C+Z">Zhiyong Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+X">Xiaopeng Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Xie%2C+Y">Yuankun Xie</a>, <a href="/search/eess?searchtype=author&amp;query=Fu%2C+R">Ruibo Fu</a>, <a href="/search/eess?searchtype=author&amp;query=Wen%2C+Z">Zhengqi Wen</a>, <a href="/search/eess?searchtype=author&amp;query=Tao%2C+J">Jianhua Tao</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+Y">Yukun Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+G">Guanjun Li</a>, <a href="/search/eess?searchtype=author&amp;query=Qi%2C+X">Xin Qi</a>, <a href="/search/eess?searchtype=author&amp;query=Lu%2C+Y">Yi Lu</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+X">Xuefei Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+Y">Yongwei Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.10849v1-abstract-short" style="display: inline;"> In the field of deepfake detection, previous studies focus on using reconstruction or mask and prediction methods to train pre-trained models, which are then transferred to fake audio detection training where the encoder is used to extract features, such as wav2vec2.0 and Masked Auto Encoder. These methods have proven that using real audio for reconstruction pre-training can better help the model&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.10849v1-abstract-full').style.display = 'inline'; document.getElementById('2408.10849v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.10849v1-abstract-full" style="display: none;"> In the field of deepfake detection, previous studies focus on using reconstruction or mask and prediction methods to train pre-trained models, which are then transferred to fake audio detection training where the encoder is used to extract features, such as wav2vec2.0 and Masked Auto Encoder. These methods have proven that using real audio for reconstruction pre-training can better help the model distinguish fake audio. However, the disadvantage lies in poor interpretability, meaning it is hard to intuitively present the differences between deepfake and real audio. This paper proposes a noval feature extraction method via color quantisation which constrains the reconstruction to use a limited number of colors for the spectral image-like input. The proposed method ensures reconstructed input differs from the original, which allows for intuitive observation of the focus areas in the spectral reconstruction. Experiments conducted on the ASVspoof2019 dataset demonstrate that the proposed method achieves better classification performance compared to using the original spectral as input and pretraining the recolor network can also benefit the fake audio detection. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.10849v1-abstract-full').style.display = 'none'; document.getElementById('2408.10849v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">accepted by ISCSLP2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.06922">arXiv:2408.06922</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.06922">pdf</a>, <a href="https://arxiv.org/format/2408.06922">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Temporal Variability and Multi-Viewed Self-Supervised Representations to Tackle the ASVspoof5 Deepfake Challenge </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Xie%2C+Y">Yuankun Xie</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+X">Xiaopeng Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+Z">Zhiyong Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Fu%2C+R">Ruibo Fu</a>, <a href="/search/eess?searchtype=author&amp;query=Wen%2C+Z">Zhengqi Wen</a>, <a href="/search/eess?searchtype=author&amp;query=Cheng%2C+H">Haonan Cheng</a>, <a href="/search/eess?searchtype=author&amp;query=Ye%2C+L">Long Ye</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.06922v1-abstract-short" style="display: inline;"> ASVspoof5, the fifth edition of the ASVspoof series, is one of the largest global audio security challenges. It aims to advance the development of countermeasure (CM) to discriminate bonafide and spoofed speech utterances. In this paper, we focus on addressing the problem of open-domain audio deepfake detection, which corresponds directly to the ASVspoof5 Track1 open condition. At first, we compre&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.06922v1-abstract-full').style.display = 'inline'; document.getElementById('2408.06922v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.06922v1-abstract-full" style="display: none;"> ASVspoof5, the fifth edition of the ASVspoof series, is one of the largest global audio security challenges. It aims to advance the development of countermeasure (CM) to discriminate bonafide and spoofed speech utterances. In this paper, we focus on addressing the problem of open-domain audio deepfake detection, which corresponds directly to the ASVspoof5 Track1 open condition. At first, we comprehensively investigate various CM on ASVspoof5, including data expansion, data augmentation, and self-supervised learning (SSL) features. Due to the high-frequency gaps characteristic of the ASVspoof5 dataset, we introduce Frequency Mask, a data augmentation method that masks specific frequency bands to improve CM robustness. Combining various scale of temporal information with multiple SSL features, our experiments achieved a minDCF of 0.0158 and an EER of 0.55% on the ASVspoof 5 Track 1 evaluation progress set. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.06922v1-abstract-full').style.display = 'none'; document.getElementById('2408.06922v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.05758">arXiv:2408.05758</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.05758">pdf</a>, <a href="https://arxiv.org/format/2408.05758">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> VQ-CTAP: Cross-Modal Fine-Grained Sequence Representation Learning for Speech Processing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Qiang%2C+C">Chunyu Qiang</a>, <a href="/search/eess?searchtype=author&amp;query=Geng%2C+W">Wang Geng</a>, <a href="/search/eess?searchtype=author&amp;query=Zhao%2C+Y">Yi Zhao</a>, <a href="/search/eess?searchtype=author&amp;query=Fu%2C+R">Ruibo Fu</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+T">Tao Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Gong%2C+C">Cheng Gong</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+T">Tianrui Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+Q">Qiuyu Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Yi%2C+J">Jiangyan Yi</a>, <a href="/search/eess?searchtype=author&amp;query=Wen%2C+Z">Zhengqi Wen</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+C">Chen Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Che%2C+H">Hao Che</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+L">Longbiao Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Dang%2C+J">Jianwu Dang</a>, <a href="/search/eess?searchtype=author&amp;query=Tao%2C+J">Jianhua Tao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.05758v1-abstract-short" style="display: inline;"> Deep learning has brought significant improvements to the field of cross-modal representation learning. For tasks such as text-to-speech (TTS), voice conversion (VC), and automatic speech recognition (ASR), a cross-modal fine-grained (frame-level) sequence representation is desired, emphasizing the semantic content of the text modality while de-emphasizing the paralinguistic information of the spe&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.05758v1-abstract-full').style.display = 'inline'; document.getElementById('2408.05758v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.05758v1-abstract-full" style="display: none;"> Deep learning has brought significant improvements to the field of cross-modal representation learning. For tasks such as text-to-speech (TTS), voice conversion (VC), and automatic speech recognition (ASR), a cross-modal fine-grained (frame-level) sequence representation is desired, emphasizing the semantic content of the text modality while de-emphasizing the paralinguistic information of the speech modality. We propose a method called &#34;Vector Quantized Contrastive Token-Acoustic Pre-training (VQ-CTAP)&#34;, which uses the cross-modal aligned sequence transcoder to bring text and speech into a joint multimodal space, learning how to connect text and speech at the frame level. The proposed VQ-CTAP is a paradigm for cross-modal sequence representation learning, offering a promising solution for fine-grained generation and recognition tasks in speech processing. The VQ-CTAP can be directly applied to VC and ASR tasks without fine-tuning or additional structures. We propose a sequence-aware semantic connector, which connects multiple frozen pre-trained modules for the TTS task, exhibiting a plug-and-play capability. We design a stepping optimization strategy to ensure effective model convergence by gradually injecting and adjusting the influence of various loss components. Furthermore, we propose a semantic-transfer-wise paralinguistic consistency loss to enhance representational capabilities, allowing the model to better generalize to unseen data and capture the nuances of paralinguistic information. In addition, VQ-CTAP achieves high-compression speech coding at a rate of 25Hz from 24kHz input waveforms, which is a 960-fold reduction in the sampling rate. The audio demo is available at https://qiangchunyu.github.io/VQCTAP/ <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.05758v1-abstract-full').style.display = 'none'; document.getElementById('2408.05758v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.12038">arXiv:2407.12038</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.12038">pdf</a>, <a href="https://arxiv.org/ps/2407.12038">ps</a>, <a href="https://arxiv.org/format/2407.12038">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> ICAGC 2024: Inspirational and Convincing Audio Generation Challenge 2024 </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Fu%2C+R">Ruibo Fu</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+R">Rui Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Qiang%2C+C">Chunyu Qiang</a>, <a href="/search/eess?searchtype=author&amp;query=Gao%2C+Y">Yingming Gao</a>, <a href="/search/eess?searchtype=author&amp;query=Lu%2C+Y">Yi Lu</a>, <a href="/search/eess?searchtype=author&amp;query=Shi%2C+S">Shuchen Shi</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+T">Tao Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+Y">Ya Li</a>, <a href="/search/eess?searchtype=author&amp;query=Wen%2C+Z">Zhengqi Wen</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+C">Chen Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Bu%2C+H">Hui Bu</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+Y">Yukun Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Qi%2C+X">Xin Qi</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+G">Guanjun Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.12038v2-abstract-short" style="display: inline;"> The Inspirational and Convincing Audio Generation Challenge 2024 (ICAGC 2024) is part of the ISCSLP 2024 Competitions and Challenges track. While current text-to-speech (TTS) technology can generate high-quality audio, its ability to convey complex emotions and controlled detail content remains limited. This constraint leads to a discrepancy between the generated audio and human subjective percept&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.12038v2-abstract-full').style.display = 'inline'; document.getElementById('2407.12038v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.12038v2-abstract-full" style="display: none;"> The Inspirational and Convincing Audio Generation Challenge 2024 (ICAGC 2024) is part of the ISCSLP 2024 Competitions and Challenges track. While current text-to-speech (TTS) technology can generate high-quality audio, its ability to convey complex emotions and controlled detail content remains limited. This constraint leads to a discrepancy between the generated audio and human subjective perception in practical applications like companion robots for children and marketing bots. The core issue lies in the inconsistency between high-quality audio generation and the ultimate human subjective experience. Therefore, this challenge aims to enhance the persuasiveness and acceptability of synthesized audio, focusing on human alignment convincing and inspirational audio generation. A total of 19 teams have registered for the challenge, and the results of the competition and the competition are described in this paper. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.12038v2-abstract-full').style.display = 'none'; document.getElementById('2407.12038v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 1 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ISCSLP 2024 Challenge description and results</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.05421">arXiv:2407.05421</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.05421">pdf</a>, <a href="https://arxiv.org/format/2407.05421">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> ASRRL-TTS: Agile Speaker Representation Reinforcement Learning for Text-to-Speech Speaker Adaptation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Fu%2C+R">Ruibo Fu</a>, <a href="/search/eess?searchtype=author&amp;query=Qi%2C+X">Xin Qi</a>, <a href="/search/eess?searchtype=author&amp;query=Wen%2C+Z">Zhengqi Wen</a>, <a href="/search/eess?searchtype=author&amp;query=Tao%2C+J">Jianhua Tao</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+T">Tao Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Qiang%2C+C">Chunyu Qiang</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+Z">Zhiyong Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Lu%2C+Y">Yi Lu</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+X">Xiaopeng Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Shi%2C+S">Shuchen Shi</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+Y">Yukun Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+X">Xuefei Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+S">Shuai Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.05421v1-abstract-short" style="display: inline;"> Speaker adaptation, which involves cloning voices from unseen speakers in the Text-to-Speech task, has garnered significant interest due to its numerous applications in multi-media fields. Despite recent advancements, existing methods often struggle with inadequate speaker representation accuracy and overfitting, particularly in limited reference speeches scenarios. To address these challenges, we&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.05421v1-abstract-full').style.display = 'inline'; document.getElementById('2407.05421v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.05421v1-abstract-full" style="display: none;"> Speaker adaptation, which involves cloning voices from unseen speakers in the Text-to-Speech task, has garnered significant interest due to its numerous applications in multi-media fields. Despite recent advancements, existing methods often struggle with inadequate speaker representation accuracy and overfitting, particularly in limited reference speeches scenarios. To address these challenges, we propose an Agile Speaker Representation Reinforcement Learning strategy to enhance speaker similarity in speaker adaptation tasks. ASRRL is the first work to apply reinforcement learning to improve the modeling accuracy of speaker embeddings in speaker adaptation, addressing the challenge of decoupling voice content and timbre. Our approach introduces two action strategies tailored to different reference speeches scenarios. In the single-sentence scenario, a knowledge-oriented optimal routine searching RL method is employed to expedite the exploration and retrieval of refinement information on the fringe of speaker representations. In the few-sentence scenario, we utilize a dynamic RL method to adaptively fuse reference speeches, enhancing the robustness and accuracy of speaker modeling. To achieve optimal results in the target domain, a multi-scale fusion scoring mechanism based reward model that evaluates speaker similarity, speech quality, and intelligibility across three dimensions is proposed, ensuring that improvements in speaker similarity do not compromise speech quality or intelligibility. The experimental results on the LibriTTS and VCTK datasets within mainstream TTS frameworks demonstrate the extensibility and generalization capabilities of the proposed ASRRL method. The results indicate that the ASRRL method significantly outperforms traditional fine-tuning approaches, achieving higher speaker similarity and better overall speech quality with limited reference speeches. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.05421v1-abstract-full').style.display = 'none'; document.getElementById('2407.05421v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">The audio demo is available at https://7xin.github.io/ASRRL/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.17801">arXiv:2406.17801</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.17801">pdf</a>, <a href="https://arxiv.org/format/2406.17801">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> A multi-speaker multi-lingual voice cloning system based on vits2 for limmits 2024 challenge </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Wang%2C+X">Xiaopeng Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Lu%2C+Y">Yi Lu</a>, <a href="/search/eess?searchtype=author&amp;query=Qi%2C+X">Xin Qi</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+Z">Zhiyong Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Xie%2C+Y">Yuankun Xie</a>, <a href="/search/eess?searchtype=author&amp;query=Shi%2C+S">Shuchen Shi</a>, <a href="/search/eess?searchtype=author&amp;query=Fu%2C+R">Ruibo Fu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.17801v1-abstract-short" style="display: inline;"> This paper presents the development of a speech synthesis system for the LIMMITS&#39;24 Challenge, focusing primarily on Track 2. The objective of the challenge is to establish a multi-speaker, multi-lingual Indic Text-to-Speech system with voice cloning capabilities, covering seven Indian languages with both male and female speakers. The system was trained using challenge data and fine-tuned for few-&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.17801v1-abstract-full').style.display = 'inline'; document.getElementById('2406.17801v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.17801v1-abstract-full" style="display: none;"> This paper presents the development of a speech synthesis system for the LIMMITS&#39;24 Challenge, focusing primarily on Track 2. The objective of the challenge is to establish a multi-speaker, multi-lingual Indic Text-to-Speech system with voice cloning capabilities, covering seven Indian languages with both male and female speakers. The system was trained using challenge data and fine-tuned for few-shot voice cloning on target speakers. Evaluation included both mono-lingual and cross-lingual synthesis across all seven languages, with subjective tests assessing naturalness and speaker similarity. Our system uses the VITS2 architecture, augmented with a multi-lingual ID and a BERT model to enhance contextual language comprehension. In Track 1, where no additional data usage was permitted, our model achieved a Speaker Similarity score of 4.02. In Track 2, which allowed the use of extra data, it attained a Speaker Similarity score of 4.17. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.17801v1-abstract-full').style.display = 'none'; document.getElementById('2406.17801v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.10591">arXiv:2406.10591</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.10591">pdf</a>, <a href="https://arxiv.org/format/2406.10591">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> MINT: a Multi-modal Image and Narrative Text Dubbing Dataset for Foley Audio Content Planning and Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Fu%2C+R">Ruibo Fu</a>, <a href="/search/eess?searchtype=author&amp;query=Shi%2C+S">Shuchen Shi</a>, <a href="/search/eess?searchtype=author&amp;query=Guo%2C+H">Hongming Guo</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+T">Tao Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Qiang%2C+C">Chunyu Qiang</a>, <a href="/search/eess?searchtype=author&amp;query=Wen%2C+Z">Zhengqi Wen</a>, <a href="/search/eess?searchtype=author&amp;query=Tao%2C+J">Jianhua Tao</a>, <a href="/search/eess?searchtype=author&amp;query=Qi%2C+X">Xin Qi</a>, <a href="/search/eess?searchtype=author&amp;query=Lu%2C+Y">Yi Lu</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+X">Xiaopeng Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+Z">Zhiyong Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+Y">Yukun Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+X">Xuefei Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+S">Shuai Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+G">Guanjun Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.10591v1-abstract-short" style="display: inline;"> Foley audio, critical for enhancing the immersive experience in multimedia content, faces significant challenges in the AI-generated content (AIGC) landscape. Despite advancements in AIGC technologies for text and image generation, the foley audio dubbing remains rudimentary due to difficulties in cross-modal scene matching and content correlation. Current text-to-audio technology, which relies on&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.10591v1-abstract-full').style.display = 'inline'; document.getElementById('2406.10591v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.10591v1-abstract-full" style="display: none;"> Foley audio, critical for enhancing the immersive experience in multimedia content, faces significant challenges in the AI-generated content (AIGC) landscape. Despite advancements in AIGC technologies for text and image generation, the foley audio dubbing remains rudimentary due to difficulties in cross-modal scene matching and content correlation. Current text-to-audio technology, which relies on detailed and acoustically relevant textual descriptions, falls short in practical video dubbing applications. Existing datasets like AudioSet, AudioCaps, Clotho, Sound-of-Story, and WavCaps do not fully meet the requirements for real-world foley audio dubbing task. To address this, we introduce the Multi-modal Image and Narrative Text Dubbing Dataset (MINT), designed to enhance mainstream dubbing tasks such as literary story audiobooks dubbing, image/silent video dubbing. Besides, to address the limitations of existing TTA technology in understanding and planning complex prompts, a Foley Audio Content Planning, Generation, and Alignment (CPGA) framework is proposed, which includes a content planning module leveraging large language models for complex multi-modal prompts comprehension. Additionally, the training process is optimized using Proximal Policy Optimization based reinforcement learning, significantly improving the alignment and auditory realism of generated foley audio. Experimental results demonstrate that our approach significantly advances the field of foley audio dubbing, providing robust solutions for the challenges of multi-modal dubbing. Even when utilizing the relatively lightweight GPT-2 model, our framework outperforms open-source multimodal large models such as LLaVA, DeepSeek-VL, and Moondream2. The dataset is available at https://github.com/borisfrb/MINT . <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.10591v1-abstract-full').style.display = 'none'; document.getElementById('2406.10591v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.08112">arXiv:2406.08112</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.08112">pdf</a>, <a href="https://arxiv.org/format/2406.08112">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Codecfake: An Initial Dataset for Detecting LLM-based Deepfake Audio </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Lu%2C+Y">Yi Lu</a>, <a href="/search/eess?searchtype=author&amp;query=Xie%2C+Y">Yuankun Xie</a>, <a href="/search/eess?searchtype=author&amp;query=Fu%2C+R">Ruibo Fu</a>, <a href="/search/eess?searchtype=author&amp;query=Wen%2C+Z">Zhengqi Wen</a>, <a href="/search/eess?searchtype=author&amp;query=Tao%2C+J">Jianhua Tao</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+Z">Zhiyong Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Qi%2C+X">Xin Qi</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+X">Xuefei Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+Y">Yongwei Li</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+Y">Yukun Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+X">Xiaopeng Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Shi%2C+S">Shuchen Shi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.08112v1-abstract-short" style="display: inline;"> With the proliferation of Large Language Model (LLM) based deepfake audio, there is an urgent need for effective detection methods. Previous deepfake audio generation methods typically involve a multi-step generation process, with the final step using a vocoder to predict the waveform from handcrafted features. However, LLM-based audio is directly generated from discrete neural codecs in an end-to&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.08112v1-abstract-full').style.display = 'inline'; document.getElementById('2406.08112v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.08112v1-abstract-full" style="display: none;"> With the proliferation of Large Language Model (LLM) based deepfake audio, there is an urgent need for effective detection methods. Previous deepfake audio generation methods typically involve a multi-step generation process, with the final step using a vocoder to predict the waveform from handcrafted features. However, LLM-based audio is directly generated from discrete neural codecs in an end-to-end generation process, skipping the final step of vocoder processing. This poses a significant challenge for current audio deepfake detection (ADD) models based on vocoder artifacts. To effectively detect LLM-based deepfake audio, we focus on the core of the generation process, the conversion from neural codec to waveform. We propose Codecfake dataset, which is generated by seven representative neural codec methods. Experiment results show that codec-trained ADD models exhibit a 41.406% reduction in average equal error rate compared to vocoder-trained ADD models on the Codecfake test set. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.08112v1-abstract-full').style.display = 'none'; document.getElementById('2406.08112v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by INTERSPEECH 2024. arXiv admin note: substantial text overlap with arXiv:2405.04880</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.04683">arXiv:2406.04683</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.04683">pdf</a>, <a href="https://arxiv.org/format/2406.04683">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> PPPR: Portable Plug-in Prompt Refiner for Text to Audio Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Shi%2C+S">Shuchen Shi</a>, <a href="/search/eess?searchtype=author&amp;query=Fu%2C+R">Ruibo Fu</a>, <a href="/search/eess?searchtype=author&amp;query=Wen%2C+Z">Zhengqi Wen</a>, <a href="/search/eess?searchtype=author&amp;query=Tao%2C+J">Jianhua Tao</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+T">Tao Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Qiang%2C+C">Chunyu Qiang</a>, <a href="/search/eess?searchtype=author&amp;query=Lu%2C+Y">Yi Lu</a>, <a href="/search/eess?searchtype=author&amp;query=Qi%2C+X">Xin Qi</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+X">Xuefei Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+Y">Yukun Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+Y">Yongwei Li</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+Z">Zhiyong Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+X">Xiaopeng Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.04683v1-abstract-short" style="display: inline;"> Text-to-Audio (TTA) aims to generate audio that corresponds to the given text description, playing a crucial role in media production. The text descriptions in TTA datasets lack rich variations and diversity, resulting in a drop in TTA model performance when faced with complex text. To address this issue, we propose a method called Portable Plug-in Prompt Refiner, which utilizes rich knowledge abo&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.04683v1-abstract-full').style.display = 'inline'; document.getElementById('2406.04683v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.04683v1-abstract-full" style="display: none;"> Text-to-Audio (TTA) aims to generate audio that corresponds to the given text description, playing a crucial role in media production. The text descriptions in TTA datasets lack rich variations and diversity, resulting in a drop in TTA model performance when faced with complex text. To address this issue, we propose a method called Portable Plug-in Prompt Refiner, which utilizes rich knowledge about textual descriptions inherent in large language models to effectively enhance the robustness of TTA acoustic models without altering the acoustic training set. Furthermore, a Chain-of-Thought that mimics human verification is introduced to enhance the accuracy of audio descriptions, thereby improving the accuracy of generated content in practical applications. The experiments show that our method achieves a state-of-the-art Inception Score (IS) of 8.72, surpassing AudioGen, AudioLDM and Tango. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.04683v1-abstract-full').style.display = 'none'; document.getElementById('2406.04683v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">accepted by INTERSPEECH2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.03247">arXiv:2406.03247</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.03247">pdf</a>, <a href="https://arxiv.org/format/2406.03247">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Genuine-Focused Learning using Mask AutoEncoder for Generalized Fake Audio Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Wang%2C+X">Xiaopeng Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Fu%2C+R">Ruibo Fu</a>, <a href="/search/eess?searchtype=author&amp;query=Wen%2C+Z">Zhengqi Wen</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+Z">Zhiyong Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Xie%2C+Y">Yuankun Xie</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+Y">Yukun Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Tao%2C+J">Jianhua Tao</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+X">Xuefei Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+Y">Yongwei Li</a>, <a href="/search/eess?searchtype=author&amp;query=Qi%2C+X">Xin Qi</a>, <a href="/search/eess?searchtype=author&amp;query=Lu%2C+Y">Yi Lu</a>, <a href="/search/eess?searchtype=author&amp;query=Shi%2C+S">Shuchen Shi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.03247v2-abstract-short" style="display: inline;"> The generalization of Fake Audio Detection (FAD) is critical due to the emergence of new spoofing techniques. Traditional FAD methods often focus solely on distinguishing between genuine and known spoofed audio. We propose a Genuine-Focused Learning (GFL) framework guided, aiming for highly generalized FAD, called GFL-FAD. This method incorporates a Counterfactual Reasoning Enhanced Representation&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.03247v2-abstract-full').style.display = 'inline'; document.getElementById('2406.03247v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.03247v2-abstract-full" style="display: none;"> The generalization of Fake Audio Detection (FAD) is critical due to the emergence of new spoofing techniques. Traditional FAD methods often focus solely on distinguishing between genuine and known spoofed audio. We propose a Genuine-Focused Learning (GFL) framework guided, aiming for highly generalized FAD, called GFL-FAD. This method incorporates a Counterfactual Reasoning Enhanced Representation (CRER) based on audio reconstruction using the Mask AutoEncoder (MAE) architecture to accurately model genuine audio features. To reduce the influence of spoofed audio during training, we introduce a genuine audio reconstruction loss, maintaining the focus on learning genuine data features. In addition, content-related bottleneck (BN) features are extracted from the MAE to supplement the knowledge of the original audio. These BN features are adaptively fused with CRER to further improve robustness. Our method achieves state-of-the-art performance with an EER of 0.25% on ASVspoof2019 LA. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.03247v2-abstract-full').style.display = 'none'; document.getElementById('2406.03247v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 5 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by INTERSPEECH 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.03240">arXiv:2406.03240</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.03240">pdf</a>, <a href="https://arxiv.org/format/2406.03240">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Generalized Source Tracing: Detecting Novel Audio Deepfake Algorithm with Real Emphasis and Fake Dispersion Strategy </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Xie%2C+Y">Yuankun Xie</a>, <a href="/search/eess?searchtype=author&amp;query=Fu%2C+R">Ruibo Fu</a>, <a href="/search/eess?searchtype=author&amp;query=Wen%2C+Z">Zhengqi Wen</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+Z">Zhiyong Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+X">Xiaopeng Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Cheng%2C+H">Haonnan Cheng</a>, <a href="/search/eess?searchtype=author&amp;query=Ye%2C+L">Long Ye</a>, <a href="/search/eess?searchtype=author&amp;query=Tao%2C+J">Jianhua Tao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.03240v2-abstract-short" style="display: inline;"> With the proliferation of deepfake audio, there is an urgent need to investigate their attribution. Current source tracing methods can effectively distinguish in-distribution (ID) categories. However, the rapid evolution of deepfake algorithms poses a critical challenge in the accurate identification of out-of-distribution (OOD) novel deepfake algorithms. In this paper, we propose Real Emphasis an&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.03240v2-abstract-full').style.display = 'inline'; document.getElementById('2406.03240v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.03240v2-abstract-full" style="display: none;"> With the proliferation of deepfake audio, there is an urgent need to investigate their attribution. Current source tracing methods can effectively distinguish in-distribution (ID) categories. However, the rapid evolution of deepfake algorithms poses a critical challenge in the accurate identification of out-of-distribution (OOD) novel deepfake algorithms. In this paper, we propose Real Emphasis and Fake Dispersion (REFD) strategy for audio deepfake algorithm recognition, demonstrating its effectiveness in discriminating ID samples while identifying OOD samples. For effective OOD detection, we first explore current post-hoc OOD methods and propose NSD, a novel OOD approach in identifying novel deepfake algorithms through the similarity consideration of both feature and logits scores. REFD achieves 86.83% F1-score as a single system in Audio Deepfake Detection Challenge 2023 Track3, showcasing its state-of-the-art performance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.03240v2-abstract-full').style.display = 'none'; document.getElementById('2406.03240v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 5 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by INTERSPEECH 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.03237">arXiv:2406.03237</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.03237">pdf</a>, <a href="https://arxiv.org/format/2406.03237">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Generalized Fake Audio Detection via Deep Stable Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Wang%2C+Z">Zhiyong Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Fu%2C+R">Ruibo Fu</a>, <a href="/search/eess?searchtype=author&amp;query=Wen%2C+Z">Zhengqi Wen</a>, <a href="/search/eess?searchtype=author&amp;query=Xie%2C+Y">Yuankun Xie</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+Y">Yukun Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+X">Xiaopeng Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+X">Xuefei Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+Y">Yongwei Li</a>, <a href="/search/eess?searchtype=author&amp;query=Tao%2C+J">Jianhua Tao</a>, <a href="/search/eess?searchtype=author&amp;query=Lu%2C+Y">Yi Lu</a>, <a href="/search/eess?searchtype=author&amp;query=Qi%2C+X">Xin Qi</a>, <a href="/search/eess?searchtype=author&amp;query=Shi%2C+S">Shuchen Shi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.03237v1-abstract-short" style="display: inline;"> Although current fake audio detection approaches have achieved remarkable success on specific datasets, they often fail when evaluated with datasets from different distributions. Previous studies typically address distribution shift by focusing on using extra data or applying extra loss restrictions during training. However, these methods either require a substantial amount of data or complicate t&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.03237v1-abstract-full').style.display = 'inline'; document.getElementById('2406.03237v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.03237v1-abstract-full" style="display: none;"> Although current fake audio detection approaches have achieved remarkable success on specific datasets, they often fail when evaluated with datasets from different distributions. Previous studies typically address distribution shift by focusing on using extra data or applying extra loss restrictions during training. However, these methods either require a substantial amount of data or complicate the training process. In this work, we propose a stable learning-based training scheme that involves a Sample Weight Learning (SWL) module, addressing distribution shift by decorrelating all selected features via learning weights from training samples. The proposed portable plug-in-like SWL is easy to apply to multiple base models and generalizes them without using extra data during training. Experiments conducted on the ASVspoof datasets clearly demonstrate the effectiveness of SWL in generalizing different models across three evaluation datasets from different distributions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.03237v1-abstract-full').style.display = 'none'; document.getElementById('2406.03237v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">accepted by INTERSPEECH2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.04880">arXiv:2405.04880</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2405.04880">pdf</a>, <a href="https://arxiv.org/format/2405.04880">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> The Codecfake Dataset and Countermeasures for the Universally Detection of Deepfake Audio </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Xie%2C+Y">Yuankun Xie</a>, <a href="/search/eess?searchtype=author&amp;query=Lu%2C+Y">Yi Lu</a>, <a href="/search/eess?searchtype=author&amp;query=Fu%2C+R">Ruibo Fu</a>, <a href="/search/eess?searchtype=author&amp;query=Wen%2C+Z">Zhengqi Wen</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+Z">Zhiyong Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Tao%2C+J">Jianhua Tao</a>, <a href="/search/eess?searchtype=author&amp;query=Qi%2C+X">Xin Qi</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+X">Xiaopeng Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+Y">Yukun Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Cheng%2C+H">Haonan Cheng</a>, <a href="/search/eess?searchtype=author&amp;query=Ye%2C+L">Long Ye</a>, <a href="/search/eess?searchtype=author&amp;query=Sun%2C+Y">Yi Sun</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.04880v2-abstract-short" style="display: inline;"> With the proliferation of Audio Language Model (ALM) based deepfake audio, there is an urgent need for generalized detection methods. ALM-based deepfake audio currently exhibits widespread, high deception, and type versatility, posing a significant challenge to current audio deepfake detection (ADD) models trained solely on vocoded data. To effectively detect ALM-based deepfake audio, we focus on&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.04880v2-abstract-full').style.display = 'inline'; document.getElementById('2405.04880v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.04880v2-abstract-full" style="display: none;"> With the proliferation of Audio Language Model (ALM) based deepfake audio, there is an urgent need for generalized detection methods. ALM-based deepfake audio currently exhibits widespread, high deception, and type versatility, posing a significant challenge to current audio deepfake detection (ADD) models trained solely on vocoded data. To effectively detect ALM-based deepfake audio, we focus on the mechanism of the ALM-based audio generation method, the conversion from neural codec to waveform. We initially construct the Codecfake dataset, an open-source large-scale dataset, including 2 languages, over 1M audio samples, and various test conditions, focus on ALM-based audio detection. As countermeasure, to achieve universal detection of deepfake audio and tackle domain ascent bias issue of original SAM, we propose the CSAM strategy to learn a domain balanced and generalized minima. In our experiments, we first demonstrate that ADD model training with the Codecfake dataset can effectively detects ALM-based audio. Furthermore, our proposed generalization countermeasure yields the lowest average Equal Error Rate (EER) of 0.616% across all test conditions compared to baseline models. The dataset and associated code are available online. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.04880v2-abstract-full').style.display = 'none'; document.getElementById('2405.04880v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 8 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2310.08869">arXiv:2310.08869</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2310.08869">pdf</a>, <a href="https://arxiv.org/format/2310.08869">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/TASLP.2024.3389643">10.1109/TASLP.2024.3389643 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Dual-Branch Knowledge Distillation for Noise-Robust Synthetic Speech Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Fan%2C+C">Cunhang Fan</a>, <a href="/search/eess?searchtype=author&amp;query=Ding%2C+M">Mingming Ding</a>, <a href="/search/eess?searchtype=author&amp;query=Tao%2C+J">Jianhua Tao</a>, <a href="/search/eess?searchtype=author&amp;query=Fu%2C+R">Ruibo Fu</a>, <a href="/search/eess?searchtype=author&amp;query=Yi%2C+J">Jiangyan Yi</a>, <a href="/search/eess?searchtype=author&amp;query=Wen%2C+Z">Zhengqi Wen</a>, <a href="/search/eess?searchtype=author&amp;query=Lv%2C+Z">Zhao Lv</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2310.08869v2-abstract-short" style="display: inline;"> Most research in synthetic speech detection (SSD) focuses on improving performance on standard noise-free datasets. However, in actual situations, noise interference is usually present, causing significant performance degradation in SSD systems. To improve noise robustness, this paper proposes a dual-branch knowledge distillation synthetic speech detection (DKDSSD) method. Specifically, a parallel&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.08869v2-abstract-full').style.display = 'inline'; document.getElementById('2310.08869v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2310.08869v2-abstract-full" style="display: none;"> Most research in synthetic speech detection (SSD) focuses on improving performance on standard noise-free datasets. However, in actual situations, noise interference is usually present, causing significant performance degradation in SSD systems. To improve noise robustness, this paper proposes a dual-branch knowledge distillation synthetic speech detection (DKDSSD) method. Specifically, a parallel data flow of the clean teacher branch and the noisy student branch is designed, and interactive fusion module and response-based teacher-student paradigms are proposed to guide the training of noisy data from both the data distribution and decision-making perspectives. In the noisy student branch, speech enhancement is introduced initially for denoising, aiming to reduce the interference of strong noise. The proposed interactive fusion combines denoised features and noisy features to mitigate the impact of speech distortion and ensure consistency with the data distribution of the clean branch. The teacher-student paradigm maps the student&#39;s decision space to the teacher&#39;s decision space, enabling noisy speech to behave similarly to clean speech. Additionally, a joint training method is employed to optimize both branches for achieving global optimality. Experimental results based on multiple datasets demonstrate that the proposed method performs effectively in noisy environments and maintains its performance in cross-dataset experiments. Source code is available at https://github.com/fchest/DKDSSD. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.08869v2-abstract-full').style.display = 'none'; document.getElementById('2310.08869v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 13 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2309.00424">arXiv:2309.00424</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2309.00424">pdf</a>, <a href="https://arxiv.org/format/2309.00424">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Learning Speech Representation From Contrastive Token-Acoustic Pretraining </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Qiang%2C+C">Chunyu Qiang</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+H">Hao Li</a>, <a href="/search/eess?searchtype=author&amp;query=Tian%2C+Y">Yixin Tian</a>, <a href="/search/eess?searchtype=author&amp;query=Fu%2C+R">Ruibo Fu</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+T">Tao Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+L">Longbiao Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Dang%2C+J">Jianwu Dang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2309.00424v5-abstract-short" style="display: inline;"> For fine-grained generation and recognition tasks such as minimally-supervised text-to-speech (TTS), voice conversion (VC), and automatic speech recognition (ASR), the intermediate representations extracted from speech should serve as a &#34;bridge&#34; between text and acoustic information, containing information from both modalities. The semantic content is emphasized, while the paralinguistic informati&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.00424v5-abstract-full').style.display = 'inline'; document.getElementById('2309.00424v5-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2309.00424v5-abstract-full" style="display: none;"> For fine-grained generation and recognition tasks such as minimally-supervised text-to-speech (TTS), voice conversion (VC), and automatic speech recognition (ASR), the intermediate representations extracted from speech should serve as a &#34;bridge&#34; between text and acoustic information, containing information from both modalities. The semantic content is emphasized, while the paralinguistic information such as speaker identity and acoustic details should be de-emphasized. However, existing methods for extracting fine-grained intermediate representations from speech suffer from issues of excessive redundancy and dimension explosion. Contrastive learning is a good method for modeling intermediate representations from two modalities. However, existing contrastive learning methods in the audio field focus on extracting global descriptive information for downstream audio classification tasks, making them unsuitable for TTS, VC, and ASR tasks. To address these issues, we propose a method named &#34;Contrastive Token-Acoustic Pretraining (CTAP)&#34;, which uses two encoders to bring phoneme and speech into a joint multimodal space, learning how to connect phoneme and speech at the frame level. The CTAP model is trained on 210k speech and phoneme pairs, achieving minimally-supervised TTS, VC, and ASR. The proposed CTAP method offers a promising solution for fine-grained generation and recognition downstream tasks in speech processing. We provide a website with audio samples. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.00424v5-abstract-full').style.display = 'none'; document.getElementById('2309.00424v5-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 1 September, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ICASSP 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2307.15484">arXiv:2307.15484</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2307.15484">pdf</a>, <a href="https://arxiv.org/format/2307.15484">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Minimally-Supervised Speech Synthesis with Conditional Diffusion Model and Language Model: A Comparative Study of Semantic Coding </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Qiang%2C+C">Chunyu Qiang</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+H">Hao Li</a>, <a href="/search/eess?searchtype=author&amp;query=Ni%2C+H">Hao Ni</a>, <a href="/search/eess?searchtype=author&amp;query=Qu%2C+H">He Qu</a>, <a href="/search/eess?searchtype=author&amp;query=Fu%2C+R">Ruibo Fu</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+T">Tao Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+L">Longbiao Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Dang%2C+J">Jianwu Dang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2307.15484v3-abstract-short" style="display: inline;"> Recently, there has been a growing interest in text-to-speech (TTS) methods that can be trained with minimal supervision by combining two types of discrete speech representations and using two sequence-to-sequence tasks to decouple TTS. However, existing methods suffer from three problems: the high dimensionality and waveform distortion of discrete speech representations, the prosodic averaging pr&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2307.15484v3-abstract-full').style.display = 'inline'; document.getElementById('2307.15484v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2307.15484v3-abstract-full" style="display: none;"> Recently, there has been a growing interest in text-to-speech (TTS) methods that can be trained with minimal supervision by combining two types of discrete speech representations and using two sequence-to-sequence tasks to decouple TTS. However, existing methods suffer from three problems: the high dimensionality and waveform distortion of discrete speech representations, the prosodic averaging problem caused by the duration prediction model in non-autoregressive frameworks, and the information redundancy and dimension explosion problems of existing semantic encoding methods. To address these problems, three progressive methods are proposed. First, we propose Diff-LM-Speech, an autoregressive structure consisting of a language model and diffusion models, which models the semantic embedding into the mel-spectrogram based on a diffusion model to achieve higher audio quality. We also introduce a prompt encoder structure based on a variational autoencoder and a prosody bottleneck to improve prompt representation ability. Second, we propose Tetra-Diff-Speech, a non-autoregressive structure consisting of four diffusion model-based modules that design a duration diffusion model to achieve diverse prosodic expressions. Finally, we propose Tri-Diff-Speech, a non-autoregressive structure consisting of three diffusion model-based modules that verify the non-necessity of existing semantic encoding models and achieve the best results. Experimental results show that our proposed methods outperform baseline methods. We provide a website with audio samples. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2307.15484v3-abstract-full').style.display = 'none'; document.getElementById('2307.15484v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 28 July, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ICASSP 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2306.05617">arXiv:2306.05617</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2306.05617">pdf</a>, <a href="https://arxiv.org/format/2306.05617">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Low-rank Adaptation Method for Wav2vec2-based Fake Audio Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Wang%2C+C">Chenglong Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Yi%2C+J">Jiangyan Yi</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+X">Xiaohui Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Tao%2C+J">Jianhua Tao</a>, <a href="/search/eess?searchtype=author&amp;query=Xu%2C+L">Le Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Fu%2C+R">Ruibo Fu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2306.05617v1-abstract-short" style="display: inline;"> Self-supervised speech models are a rapidly developing research topic in fake audio detection. Many pre-trained models can serve as feature extractors, learning richer and higher-level speech features. However,when fine-tuning pre-trained models, there is often a challenge of excessively long training times and high memory consumption, and complete fine-tuning is also very expensive. To alleviate&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.05617v1-abstract-full').style.display = 'inline'; document.getElementById('2306.05617v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2306.05617v1-abstract-full" style="display: none;"> Self-supervised speech models are a rapidly developing research topic in fake audio detection. Many pre-trained models can serve as feature extractors, learning richer and higher-level speech features. However,when fine-tuning pre-trained models, there is often a challenge of excessively long training times and high memory consumption, and complete fine-tuning is also very expensive. To alleviate this problem, we apply low-rank adaptation(LoRA) to the wav2vec2 model, freezing the pre-trained model weights and injecting a trainable rank-decomposition matrix into each layer of the transformer architecture, greatly reducing the number of trainable parameters for downstream tasks. Compared with fine-tuning with Adam on the wav2vec2 model containing 317M training parameters, LoRA achieved similar performance by reducing the number of trainable parameters by 198 times. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.05617v1-abstract-full').style.display = 'none'; document.getElementById('2306.05617v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">6pages</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> IJCAI 2023 Workshop on Deepfake Audio Detection and Analysis </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2306.04956">arXiv:2306.04956</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2306.04956">pdf</a>, <a href="https://arxiv.org/format/2306.04956">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Adaptive Fake Audio Detection with Low-Rank Model Squeezing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+X">Xiaohui Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Yi%2C+J">Jiangyan Yi</a>, <a href="/search/eess?searchtype=author&amp;query=Tao%2C+J">Jianhua Tao</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+C">Chenlong Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Xu%2C+L">Le Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Fu%2C+R">Ruibo Fu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2306.04956v1-abstract-short" style="display: inline;"> The rapid advancement of spoofing algorithms necessitates the development of robust detection methods capable of accurately identifying emerging fake audio. Traditional approaches, such as finetuning on new datasets containing these novel spoofing algorithms, are computationally intensive and pose a risk of impairing the acquired knowledge of known fake audio types. To address these challenges, th&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.04956v1-abstract-full').style.display = 'inline'; document.getElementById('2306.04956v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2306.04956v1-abstract-full" style="display: none;"> The rapid advancement of spoofing algorithms necessitates the development of robust detection methods capable of accurately identifying emerging fake audio. Traditional approaches, such as finetuning on new datasets containing these novel spoofing algorithms, are computationally intensive and pose a risk of impairing the acquired knowledge of known fake audio types. To address these challenges, this paper proposes an innovative approach that mitigates the limitations associated with finetuning. We introduce the concept of training low-rank adaptation matrices tailored specifically to the newly emerging fake audio types. During the inference stage, these adaptation matrices are combined with the existing model to generate the final prediction output. Extensive experimentation is conducted to evaluate the efficacy of the proposed method. The results demonstrate that our approach effectively preserves the prediction accuracy of the existing model for known fake audio types. Furthermore, our approach offers several advantages, including reduced storage memory requirements and lower equal error rates compared to conventional finetuning methods, particularly on specific spoofing algorithms. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.04956v1-abstract-full').style.display = 'none'; document.getElementById('2306.04956v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> DADA workshop on IJCAI 2023 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2305.13774">arXiv:2305.13774</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2305.13774">pdf</a>, <a href="https://arxiv.org/format/2305.13774">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> ADD 2023: the Second Audio Deepfake Detection Challenge </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Yi%2C+J">Jiangyan Yi</a>, <a href="/search/eess?searchtype=author&amp;query=Tao%2C+J">Jianhua Tao</a>, <a href="/search/eess?searchtype=author&amp;query=Fu%2C+R">Ruibo Fu</a>, <a href="/search/eess?searchtype=author&amp;query=Yan%2C+X">Xinrui Yan</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+C">Chenglong Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+T">Tao Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+C+Y">Chu Yuan Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+X">Xiaohui Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Zhao%2C+Y">Yan Zhao</a>, <a href="/search/eess?searchtype=author&amp;query=Ren%2C+Y">Yong Ren</a>, <a href="/search/eess?searchtype=author&amp;query=Xu%2C+L">Le Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Zhou%2C+J">Junzuo Zhou</a>, <a href="/search/eess?searchtype=author&amp;query=Gu%2C+H">Hao Gu</a>, <a href="/search/eess?searchtype=author&amp;query=Wen%2C+Z">Zhengqi Wen</a>, <a href="/search/eess?searchtype=author&amp;query=Liang%2C+S">Shan Liang</a>, <a href="/search/eess?searchtype=author&amp;query=Lian%2C+Z">Zheng Lian</a>, <a href="/search/eess?searchtype=author&amp;query=Nie%2C+S">Shuai Nie</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+H">Haizhou Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2305.13774v1-abstract-short" style="display: inline;"> Audio deepfake detection is an emerging topic in the artificial intelligence community. The second Audio Deepfake Detection Challenge (ADD 2023) aims to spur researchers around the world to build new innovative technologies that can further accelerate and foster research on detecting and analyzing deepfake speech utterances. Different from previous challenges (e.g. ADD 2022), ADD 2023 focuses on s&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.13774v1-abstract-full').style.display = 'inline'; document.getElementById('2305.13774v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2305.13774v1-abstract-full" style="display: none;"> Audio deepfake detection is an emerging topic in the artificial intelligence community. The second Audio Deepfake Detection Challenge (ADD 2023) aims to spur researchers around the world to build new innovative technologies that can further accelerate and foster research on detecting and analyzing deepfake speech utterances. Different from previous challenges (e.g. ADD 2022), ADD 2023 focuses on surpassing the constraints of binary real/fake classification, and actually localizing the manipulated intervals in a partially fake speech as well as pinpointing the source responsible for generating any fake audio. Furthermore, ADD 2023 includes more rounds of evaluation for the fake audio game sub-challenge. The ADD 2023 challenge includes three subchallenges: audio fake game (FG), manipulation region location (RL) and deepfake algorithm recognition (AR). This paper describes the datasets, evaluation metrics, and protocols. Some findings are also reported in audio deepfake detection tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.13774v1-abstract-full').style.display = 'none'; document.getElementById('2305.13774v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2305.13701">arXiv:2305.13701</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2305.13701">pdf</a>, <a href="https://arxiv.org/format/2305.13701">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> TO-Rawnet: Improving RawNet with TCN and Orthogonal Regularization for Fake Audio Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Wang%2C+C">Chenglong Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Yi%2C+J">Jiangyan Yi</a>, <a href="/search/eess?searchtype=author&amp;query=Tao%2C+J">Jianhua Tao</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+C">Chuyuan Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+S">Shuai Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Fu%2C+R">Ruibo Fu</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+X">Xun Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2305.13701v1-abstract-short" style="display: inline;"> Current fake audio detection relies on hand-crafted features, which lose information during extraction. To overcome this, recent studies use direct feature extraction from raw audio signals. For example, RawNet is one of the representative works in end-to-end fake audio detection. However, existing work on RawNet does not optimize the parameters of the Sinc-conv during training, which limited its&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.13701v1-abstract-full').style.display = 'inline'; document.getElementById('2305.13701v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2305.13701v1-abstract-full" style="display: none;"> Current fake audio detection relies on hand-crafted features, which lose information during extraction. To overcome this, recent studies use direct feature extraction from raw audio signals. For example, RawNet is one of the representative works in end-to-end fake audio detection. However, existing work on RawNet does not optimize the parameters of the Sinc-conv during training, which limited its performance. In this paper, we propose to incorporate orthogonal convolution into RawNet, which reduces the correlation between filters when optimizing the parameters of Sinc-conv, thus improving discriminability. Additionally, we introduce temporal convolutional networks (TCN) to capture long-term dependencies in speech signals. Experiments on the ASVspoof 2019 show that the Our TO-RawNet system can relatively reduce EER by 66.09\% on logical access scenario compared with the RawNet, demonstrating its effectiveness in detecting fake audio attacks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.13701v1-abstract-full').style.display = 'none'; document.getElementById('2305.13701v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Interspeech2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2304.02978">arXiv:2304.02978</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2304.02978">pdf</a>, <a href="https://arxiv.org/format/2304.02978">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Simplifying Low-Light Image Enhancement Networks with Relative Loss Functions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+Y">Yu Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Di%2C+X">Xiaoguang Di</a>, <a href="/search/eess?searchtype=author&amp;query=Wu%2C+J">Junde Wu</a>, <a href="/search/eess?searchtype=author&amp;query=Fu%2C+R">Rao Fu</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+Y">Yong Li</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+Y">Yue Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Xu%2C+Y">Yanwu Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Yang%2C+G">Guohui Yang</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+C">Chunhui Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2304.02978v2-abstract-short" style="display: inline;"> Image enhancement is a common technique used to mitigate issues such as severe noise, low brightness, low contrast, and color deviation in low-light images. However, providing an optimal high-light image as a reference for low-light image enhancement tasks is impossible, which makes the learning process more difficult than other image processing tasks. As a result, although several low-light image&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2304.02978v2-abstract-full').style.display = 'inline'; document.getElementById('2304.02978v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2304.02978v2-abstract-full" style="display: none;"> Image enhancement is a common technique used to mitigate issues such as severe noise, low brightness, low contrast, and color deviation in low-light images. However, providing an optimal high-light image as a reference for low-light image enhancement tasks is impossible, which makes the learning process more difficult than other image processing tasks. As a result, although several low-light image enhancement methods have been proposed, most of them are either too complex or insufficient in addressing all the issues in low-light images. In this paper, to make the learning easier in low-light image enhancement, we introduce FLW-Net (Fast and LightWeight Network) and two relative loss functions. Specifically, we first recognize the challenges of the need for a large receptive field to obtain global contrast and the lack of an absolute reference, which limits the simplification of network structures in this task. Then, we propose an efficient global feature information extraction component and two loss functions based on relative information to overcome these challenges. Finally, we conducted comparative experiments to demonstrate the effectiveness of the proposed method, and the results confirm that the proposed method can significantly reduce the complexity of supervised low-light image enhancement networks while improving processing effect. The code is available at \url{https://github.com/hitzhangyu/FLW-Net}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2304.02978v2-abstract-full').style.display = 'none'; document.getElementById('2304.02978v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 August, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 6 April, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">19 pages, 11 figures</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 68Txx <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.4.3 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2301.03801">arXiv:2301.03801</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2301.03801">pdf</a>, <a href="https://arxiv.org/format/2301.03801">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> UnifySpeech: A Unified Framework for Zero-shot Text-to-Speech and Voice Conversion </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Liu%2C+H">Haogeng Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+T">Tao Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Fu%2C+R">Ruibo Fu</a>, <a href="/search/eess?searchtype=author&amp;query=Yi%2C+J">Jiangyan Yi</a>, <a href="/search/eess?searchtype=author&amp;query=Wen%2C+Z">Zhengqi Wen</a>, <a href="/search/eess?searchtype=author&amp;query=Tao%2C+J">Jianhua Tao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2301.03801v1-abstract-short" style="display: inline;"> Text-to-speech (TTS) and voice conversion (VC) are two different tasks both aiming at generating high quality speaking voice according to different input modality. Due to their similarity, this paper proposes UnifySpeech, which brings TTS and VC into a unified framework for the first time. The model is based on the assumption that speech can be decoupled into three independent components: content&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2301.03801v1-abstract-full').style.display = 'inline'; document.getElementById('2301.03801v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2301.03801v1-abstract-full" style="display: none;"> Text-to-speech (TTS) and voice conversion (VC) are two different tasks both aiming at generating high quality speaking voice according to different input modality. Due to their similarity, this paper proposes UnifySpeech, which brings TTS and VC into a unified framework for the first time. The model is based on the assumption that speech can be decoupled into three independent components: content information, speaker information, prosody information. Both TTS and VC can be regarded as mining these three parts of information from the input and completing the reconstruction of speech. For TTS, the speech content information is derived from the text, while in VC it&#39;s derived from the source speech, so all the remaining units are shared except for the speech content extraction module in the two tasks. We applied vector quantization and domain constrain to bridge the gap between the content domains of TTS and VC. Objective and subjective evaluation shows that by combining the two task, TTS obtains better speaker modeling ability while VC gets hold of impressive speech content decoupling capability. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2301.03801v1-abstract-full').style.display = 'none'; document.getElementById('2301.03801v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 January, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2212.10191">arXiv:2212.10191</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2212.10191">pdf</a>, <a href="https://arxiv.org/format/2212.10191">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Emotion Selectable End-to-End Text-based Speech Editing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Wang%2C+T">Tao Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Yi%2C+J">Jiangyan Yi</a>, <a href="/search/eess?searchtype=author&amp;query=Fu%2C+R">Ruibo Fu</a>, <a href="/search/eess?searchtype=author&amp;query=Tao%2C+J">Jianhua Tao</a>, <a href="/search/eess?searchtype=author&amp;query=Wen%2C+Z">Zhengqi Wen</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+C+Y">Chu Yuan Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2212.10191v1-abstract-short" style="display: inline;"> Text-based speech editing allows users to edit speech by intuitively cutting, copying, and pasting text to speed up the process of editing speech. In the previous work, CampNet (context-aware mask prediction network) is proposed to realize text-based speech editing, significantly improving the quality of edited speech. This paper aims at a new task: adding emotional effect to the editing speech du&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2212.10191v1-abstract-full').style.display = 'inline'; document.getElementById('2212.10191v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2212.10191v1-abstract-full" style="display: none;"> Text-based speech editing allows users to edit speech by intuitively cutting, copying, and pasting text to speed up the process of editing speech. In the previous work, CampNet (context-aware mask prediction network) is proposed to realize text-based speech editing, significantly improving the quality of edited speech. This paper aims at a new task: adding emotional effect to the editing speech during the text-based speech editing to make the generated speech more expressive. To achieve this task, we propose Emo-CampNet (emotion CampNet), which can provide the option of emotional attributes for the generated speech in text-based speech editing and has the one-shot ability to edit unseen speakers&#39; speech. Firstly, we propose an end-to-end emotion-selectable text-based speech editing model. The key idea of the model is to control the emotion of generated speech by introducing additional emotion attributes based on the context-aware mask prediction network. Secondly, to prevent the emotion of the generated speech from being interfered by the emotional components in the original speech, a neutral content generator is proposed to remove the emotion from the original speech, which is optimized by the generative adversarial framework. Thirdly, two data augmentation methods are proposed to enrich the emotional and pronunciation information in the training set, which can enable the model to edit the unseen speaker&#39;s speech. The experimental results that 1) Emo-CampNet can effectively control the emotion of the generated speech in the process of text-based speech editing; And can edit unseen speakers&#39; speech. 2) Detailed ablation experiments further prove the effectiveness of emotional selectivity and data augmentation methods. The demo page is available at https://hairuo55.github.io/Emo-CampNet/ <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2212.10191v1-abstract-full').style.display = 'none'; document.getElementById('2212.10191v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 December, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Under review, 12 pages, 11 figures, demo page is available at https://hairuo55.github.io/Emo-CampNet/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2211.06073">arXiv:2211.06073</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2211.06073">pdf</a>, <a href="https://arxiv.org/format/2211.06073">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> SceneFake: An Initial Dataset and Benchmarks for Scene Fake Audio Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Yi%2C+J">Jiangyan Yi</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+C">Chenglong Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Tao%2C+J">Jianhua Tao</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+C+Y">Chu Yuan Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Fan%2C+C">Cunhang Fan</a>, <a href="/search/eess?searchtype=author&amp;query=Tian%2C+Z">Zhengkun Tian</a>, <a href="/search/eess?searchtype=author&amp;query=Ma%2C+H">Haoxin Ma</a>, <a href="/search/eess?searchtype=author&amp;query=Fu%2C+R">Ruibo Fu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2211.06073v2-abstract-short" style="display: inline;"> Many datasets have been designed to further the development of fake audio detection. However, fake utterances in previous datasets are mostly generated by altering timbre, prosody, linguistic content or channel noise of original audio. These datasets leave out a scenario, in which the acoustic scene of an original audio is manipulated with a forged one. It will pose a major threat to our society i&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.06073v2-abstract-full').style.display = 'inline'; document.getElementById('2211.06073v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2211.06073v2-abstract-full" style="display: none;"> Many datasets have been designed to further the development of fake audio detection. However, fake utterances in previous datasets are mostly generated by altering timbre, prosody, linguistic content or channel noise of original audio. These datasets leave out a scenario, in which the acoustic scene of an original audio is manipulated with a forged one. It will pose a major threat to our society if some people misuse the manipulated audio with malicious purpose. Therefore, this motivates us to fill in the gap. This paper proposes such a dataset for scene fake audio detection named SceneFake, where a manipulated audio is generated by only tampering with the acoustic scene of an real utterance by using speech enhancement technologies. Some scene fake audio detection benchmark results on the SceneFake dataset are reported in this paper. In addition, an analysis of fake attacks with different speech enhancement technologies and signal-to-noise ratios are presented in this paper. The results indicate that scene fake utterances cannot be reliably detected by baseline models trained on the ASVspoof 2019 dataset. Although these models perform well on the SceneFake training set and seen testing set, their performance is poor on the unseen test set. The dataset (https://zenodo.org/record/7663324#.Y_XKMuPYuUk) and benchmark source codes (https://github.com/ADDchallenge/SceneFake) are publicly available. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.06073v2-abstract-full').style.display = 'none'; document.getElementById('2211.06073v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 11 November, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by Pattern Recognition, 1 April 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2210.11429">arXiv:2210.11429</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2210.11429">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Text Enhancement for Paragraph Processing in End-to-End Code-switching TTS </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Qiang%2C+C">Chunyu Qiang</a>, <a href="/search/eess?searchtype=author&amp;query=Tao%2C+J">Jianhua Tao</a>, <a href="/search/eess?searchtype=author&amp;query=Fu%2C+R">Ruibo Fu</a>, <a href="/search/eess?searchtype=author&amp;query=Wen%2C+Z">Zhengqi Wen</a>, <a href="/search/eess?searchtype=author&amp;query=Yi%2C+J">Jiangyan Yi</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+T">Tao Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+S">Shiming Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2210.11429v1-abstract-short" style="display: inline;"> Current end-to-end code-switching Text-to-Speech (TTS) can already generate high quality two languages speech in the same utterance with single speaker bilingual corpora. When the speakers of the bilingual corpora are different, the naturalness and consistency of the code-switching TTS will be poor. The cross-lingual embedding layers structure we proposed makes similar syllables in different langu&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2210.11429v1-abstract-full').style.display = 'inline'; document.getElementById('2210.11429v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2210.11429v1-abstract-full" style="display: none;"> Current end-to-end code-switching Text-to-Speech (TTS) can already generate high quality two languages speech in the same utterance with single speaker bilingual corpora. When the speakers of the bilingual corpora are different, the naturalness and consistency of the code-switching TTS will be poor. The cross-lingual embedding layers structure we proposed makes similar syllables in different languages relevant, thus improving the naturalness and consistency of generated speech. In the end-to-end code-switching TTS, there exists problem of prosody instability when synthesizing paragraph text. The text enhancement method we proposed makes the input contain prosodic information and sentence-level context information, thus improving the prosody stability of paragraph text. Experimental results demonstrate the effectiveness of the proposed methods in the naturalness, consistency, and prosody stability. In addition to Mandarin and English, we also apply these methods to Shanghaiese and Cantonese corpora, proving that the methods we proposed can be extended to other languages to build end-to-end code-switching TTS system. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2210.11429v1-abstract-full').style.display = 'none'; document.getElementById('2210.11429v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 October, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">accepted in ISCSLP 2021</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2208.09646">arXiv:2208.09646</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2208.09646">pdf</a>, <a href="https://arxiv.org/format/2208.09646">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3552466.3556525">10.1145/3552466.3556525 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> An Initial Investigation for Detecting Vocoder Fingerprints of Fake Audio </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Yan%2C+X">Xinrui Yan</a>, <a href="/search/eess?searchtype=author&amp;query=Yi%2C+J">Jiangyan Yi</a>, <a href="/search/eess?searchtype=author&amp;query=Tao%2C+J">Jianhua Tao</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+C">Chenglong Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Ma%2C+H">Haoxin Ma</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+T">Tao Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+S">Shiming Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Fu%2C+R">Ruibo Fu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2208.09646v1-abstract-short" style="display: inline;"> Many effective attempts have been made for fake audio detection. However, they can only provide detection results but no countermeasures to curb this harm. For many related practical applications, what model or algorithm generated the fake audio also is needed. Therefore, We propose a new problem for detecting vocoder fingerprints of fake audio. Experiments are conducted on the datasets synthesize&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2208.09646v1-abstract-full').style.display = 'inline'; document.getElementById('2208.09646v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2208.09646v1-abstract-full" style="display: none;"> Many effective attempts have been made for fake audio detection. However, they can only provide detection results but no countermeasures to curb this harm. For many related practical applications, what model or algorithm generated the fake audio also is needed. Therefore, We propose a new problem for detecting vocoder fingerprints of fake audio. Experiments are conducted on the datasets synthesized by eight state-of-the-art vocoders. We have preliminarily explored the features and model architectures. The t-SNE visualization shows that different vocoders generate distinct vocoder fingerprints. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2208.09646v1-abstract-full').style.display = 'none'; document.getElementById('2208.09646v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 August, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ACM Multimedia 2022 Workshop: First International Workshop on Deepfake Detection for Audio Multimedia</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2208.09618">arXiv:2208.09618</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2208.09618">pdf</a>, <a href="https://arxiv.org/format/2208.09618">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Fully Automated End-to-End Fake Audio Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Wang%2C+C">Chenglong Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Yi%2C+J">Jiangyan Yi</a>, <a href="/search/eess?searchtype=author&amp;query=Tao%2C+J">Jianhua Tao</a>, <a href="/search/eess?searchtype=author&amp;query=Sun%2C+H">Haiyang Sun</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+X">Xun Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Tian%2C+Z">Zhengkun Tian</a>, <a href="/search/eess?searchtype=author&amp;query=Ma%2C+H">Haoxin Ma</a>, <a href="/search/eess?searchtype=author&amp;query=Fan%2C+C">Cunhang Fan</a>, <a href="/search/eess?searchtype=author&amp;query=Fu%2C+R">Ruibo Fu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2208.09618v1-abstract-short" style="display: inline;"> The existing fake audio detection systems often rely on expert experience to design the acoustic features or manually design the hyperparameters of the network structure. However, artificial adjustment of the parameters can have a relatively obvious influence on the results. It is almost impossible to manually set the best set of parameters. Therefore this paper proposes a fully automated end-toen&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2208.09618v1-abstract-full').style.display = 'inline'; document.getElementById('2208.09618v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2208.09618v1-abstract-full" style="display: none;"> The existing fake audio detection systems often rely on expert experience to design the acoustic features or manually design the hyperparameters of the network structure. However, artificial adjustment of the parameters can have a relatively obvious influence on the results. It is almost impossible to manually set the best set of parameters. Therefore this paper proposes a fully automated end-toend fake audio detection method. We first use wav2vec pre-trained model to obtain a high-level representation of the speech. Furthermore, for the network structure, we use a modified version of the differentiable architecture search (DARTS) named light-DARTS. It learns deep speech representations while automatically learning and optimizing complex neural structures consisting of convolutional operations and residual blocks. The experimental results on the ASVspoof 2019 LA dataset show that our proposed system achieves an equal error rate (EER) of 1.08%, which outperforms the state-of-the-art single system. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2208.09618v1-abstract-full').style.display = 'none'; document.getElementById('2208.09618v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 August, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2022. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2207.12308">arXiv:2207.12308</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2207.12308">pdf</a>, <a href="https://arxiv.org/format/2207.12308">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> CFAD: A Chinese Dataset for Fake Audio Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Ma%2C+H">Haoxin Ma</a>, <a href="/search/eess?searchtype=author&amp;query=Yi%2C+J">Jiangyan Yi</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+C">Chenglong Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Yan%2C+X">Xinrui Yan</a>, <a href="/search/eess?searchtype=author&amp;query=Tao%2C+J">Jianhua Tao</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+T">Tao Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+S">Shiming Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Fu%2C+R">Ruibo Fu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2207.12308v3-abstract-short" style="display: inline;"> Fake audio detection is a growing concern and some relevant datasets have been designed for research. However, there is no standard public Chinese dataset under complex conditions.In this paper, we aim to fill in the gap and design a Chinese fake audio detection dataset (CFAD) for studying more generalized detection methods. Twelve mainstream speech-generation techniques are used to generate fake&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2207.12308v3-abstract-full').style.display = 'inline'; document.getElementById('2207.12308v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2207.12308v3-abstract-full" style="display: none;"> Fake audio detection is a growing concern and some relevant datasets have been designed for research. However, there is no standard public Chinese dataset under complex conditions.In this paper, we aim to fill in the gap and design a Chinese fake audio detection dataset (CFAD) for studying more generalized detection methods. Twelve mainstream speech-generation techniques are used to generate fake audio. To simulate the real-life scenarios, three noise datasets are selected for noise adding at five different signal-to-noise ratios, and six codecs are considered for audio transcoding (format conversion). CFAD dataset can be used not only for fake audio detection but also for detecting the algorithms of fake utterances for audio forensics. Baseline results are presented with analysis. The results that show fake audio detection methods with generalization remain challenging. The CFAD dataset is publicly available at: https://zenodo.org/record/8122764. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2207.12308v3-abstract-full').style.display = 'none'; document.getElementById('2207.12308v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 July, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 12 July, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">FAD renamed as CFAD</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2203.02678">arXiv:2203.02678</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2203.02678">pdf</a>, <a href="https://arxiv.org/format/2203.02678">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/TASLP.2022.3140480">10.1109/TASLP.2022.3140480 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> NeuralDPS: Neural Deterministic Plus Stochastic Model with Multiband Excitation for Noise-Controllable Waveform Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Wang%2C+T">Tao Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Fu%2C+R">Ruibo Fu</a>, <a href="/search/eess?searchtype=author&amp;query=Yi%2C+J">Jiangyan Yi</a>, <a href="/search/eess?searchtype=author&amp;query=Tao%2C+J">Jianhua Tao</a>, <a href="/search/eess?searchtype=author&amp;query=Wen%2C+Z">Zhengqi Wen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2203.02678v1-abstract-short" style="display: inline;"> The traditional vocoders have the advantages of high synthesis efficiency, strong interpretability, and speech editability, while the neural vocoders have the advantage of high synthesis quality. To combine the advantages of two vocoders, inspired by the traditional deterministic plus stochastic model, this paper proposes a novel neural vocoder named NeuralDPS which can retain high speech quality&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2203.02678v1-abstract-full').style.display = 'inline'; document.getElementById('2203.02678v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2203.02678v1-abstract-full" style="display: none;"> The traditional vocoders have the advantages of high synthesis efficiency, strong interpretability, and speech editability, while the neural vocoders have the advantage of high synthesis quality. To combine the advantages of two vocoders, inspired by the traditional deterministic plus stochastic model, this paper proposes a novel neural vocoder named NeuralDPS which can retain high speech quality and acquire high synthesis efficiency and noise controllability. Firstly, this framework contains four modules: a deterministic source module, a stochastic source module, a neural V/UV decision module and a neural filter module. The input required by the vocoder is just the spectral parameter, which avoids the error caused by estimating additional parameters, such as F0. Secondly, to solve the problem that different frequency bands may have different proportions of deterministic components and stochastic components, a multiband excitation strategy is used to generate a more accurate excitation signal and reduce the neural filter&#39;s burden. Thirdly, a method to control noise components of speech is proposed. In this way, the signal-to-noise ratio (SNR) of speech can be adjusted easily. Objective and subjective experimental results show that our proposed NeuralDPS vocoder can obtain similar performance with the WaveNet and it generates waveforms at least 280 times faster than the WaveNet vocoder. It is also 28% faster than WaveGAN&#39;s synthesis efficiency on a single CPU core. We have also verified through experiments that this method can effectively control the noise components in the predicted speech and adjust the SNR of speech. Examples of generated speech can be found at https://hairuo55.github.io/NeuralDPS. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2203.02678v1-abstract-full').style.display = 'none'; document.getElementById('2203.02678v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 March, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">15 pages, 12 figures; Accepted to TASLP. Demo page https://hairuo55.github.io/NeuralDPS. arXiv admin note: text overlap with arXiv:1906.09573 by other authors</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2202.09950">arXiv:2202.09950</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2202.09950">pdf</a>, <a href="https://arxiv.org/format/2202.09950">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> CampNet: Context-Aware Mask Prediction for End-to-End Text-Based Speech Editing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Wang%2C+T">Tao Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Yi%2C+J">Jiangyan Yi</a>, <a href="/search/eess?searchtype=author&amp;query=Fu%2C+R">Ruibo Fu</a>, <a href="/search/eess?searchtype=author&amp;query=Tao%2C+J">Jianhua Tao</a>, <a href="/search/eess?searchtype=author&amp;query=Wen%2C+Z">Zhengqi Wen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2202.09950v2-abstract-short" style="display: inline;"> The text-based speech editor allows the editing of speech through intuitive cutting, copying, and pasting operations to speed up the process of editing speech. However, the major drawback of current systems is that edited speech often sounds unnatural due to cut-copy-paste operation. In addition, it is not obvious how to synthesize records according to a new word not appearing in the transcript. T&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2202.09950v2-abstract-full').style.display = 'inline'; document.getElementById('2202.09950v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2202.09950v2-abstract-full" style="display: none;"> The text-based speech editor allows the editing of speech through intuitive cutting, copying, and pasting operations to speed up the process of editing speech. However, the major drawback of current systems is that edited speech often sounds unnatural due to cut-copy-paste operation. In addition, it is not obvious how to synthesize records according to a new word not appearing in the transcript. This paper proposes a novel end-to-end text-based speech editing method called context-aware mask prediction network (CampNet). The model can simulate the text-based speech editing process by randomly masking part of speech and then predicting the masked region by sensing the speech context. It can solve unnatural prosody in the edited region and synthesize the speech corresponding to the unseen words in the transcript. Secondly, for the possible operation of text-based speech editing, we design three text-based operations based on CampNet: deletion, insertion, and replacement. These operations can cover various situations of speech editing. Thirdly, to synthesize the speech corresponding to long text in insertion and replacement operations, a word-level autoregressive generation method is proposed. Fourthly, we propose a speaker adaptation method using only one sentence for CampNet and explore the ability of few-shot learning based on CampNet, which provides a new idea for speech forgery tasks. The subjective and objective experiments on VCTK and LibriTTS datasets show that the speech editing results based on CampNet are better than TTS technology, manual editing, and VoCo method. We also conduct detailed ablation experiments to explore the effect of the CampNet structure on its performance. Finally, the experiment shows that speaker adaptation with only one sentence can further improve the naturalness of speech. Examples of generated speech can be found at https://hairuo55.github.io/CampNet. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2202.09950v2-abstract-full').style.display = 'none'; document.getElementById('2202.09950v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 March, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 20 February, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">under review, 14 pages, 14 figures, demo page is available at https://hairuo55.github.io/CampNet</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2202.08433">arXiv:2202.08433</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2202.08433">pdf</a>, <a href="https://arxiv.org/ps/2202.08433">ps</a>, <a href="https://arxiv.org/format/2202.08433">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> ADD 2022: the First Audio Deep Synthesis Detection Challenge </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Yi%2C+J">Jiangyan Yi</a>, <a href="/search/eess?searchtype=author&amp;query=Fu%2C+R">Ruibo Fu</a>, <a href="/search/eess?searchtype=author&amp;query=Tao%2C+J">Jianhua Tao</a>, <a href="/search/eess?searchtype=author&amp;query=Nie%2C+S">Shuai Nie</a>, <a href="/search/eess?searchtype=author&amp;query=Ma%2C+H">Haoxin Ma</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+C">Chenglong Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+T">Tao Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Tian%2C+Z">Zhengkun Tian</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+X">Xiaohui Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Bai%2C+Y">Ye Bai</a>, <a href="/search/eess?searchtype=author&amp;query=Fan%2C+C">Cunhang Fan</a>, <a href="/search/eess?searchtype=author&amp;query=Liang%2C+S">Shan Liang</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+S">Shiming Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+S">Shuai Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Yan%2C+X">Xinrui Yan</a>, <a href="/search/eess?searchtype=author&amp;query=Xu%2C+L">Le Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Wen%2C+Z">Zhengqi Wen</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+H">Haizhou Li</a>, <a href="/search/eess?searchtype=author&amp;query=Lian%2C+Z">Zheng Lian</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+B">Bin Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2202.08433v3-abstract-short" style="display: inline;"> Audio deepfake detection is an emerging topic, which was included in the ASVspoof 2021. However, the recent shared tasks have not covered many real-life and challenging scenarios. The first Audio Deep synthesis Detection challenge (ADD) was motivated to fill in the gap. The ADD 2022 includes three tracks: low-quality fake audio detection (LF), partially fake audio detection (PF) and audio fake gam&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2202.08433v3-abstract-full').style.display = 'inline'; document.getElementById('2202.08433v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2202.08433v3-abstract-full" style="display: none;"> Audio deepfake detection is an emerging topic, which was included in the ASVspoof 2021. However, the recent shared tasks have not covered many real-life and challenging scenarios. The first Audio Deep synthesis Detection challenge (ADD) was motivated to fill in the gap. The ADD 2022 includes three tracks: low-quality fake audio detection (LF), partially fake audio detection (PF) and audio fake game (FG). The LF track focuses on dealing with bona fide and fully fake utterances with various real-world noises etc. The PF track aims to distinguish the partially fake audio from the real. The FG track is a rivalry game, which includes two tasks: an audio generation task and an audio fake detection task. In this paper, we describe the datasets, evaluation metrics, and protocols. We also report major findings that reflect the recent advances in audio deepfake detection tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2202.08433v3-abstract-full').style.display = 'none'; document.getElementById('2202.08433v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 16 February, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ICASSP 2022</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2202.07907">arXiv:2202.07907</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2202.07907">pdf</a>, <a href="https://arxiv.org/format/2202.07907">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Singing-Tacotron: Global duration control attention and dynamic filter for End-to-end singing voice synthesis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Wang%2C+T">Tao Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Fu%2C+R">Ruibo Fu</a>, <a href="/search/eess?searchtype=author&amp;query=Yi%2C+J">Jiangyan Yi</a>, <a href="/search/eess?searchtype=author&amp;query=Tao%2C+J">Jianhua Tao</a>, <a href="/search/eess?searchtype=author&amp;query=Wen%2C+Z">Zhengqi Wen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2202.07907v1-abstract-short" style="display: inline;"> End-to-end singing voice synthesis (SVS) is attractive due to the avoidance of pre-aligned data. However, the auto learned alignment of singing voice with lyrics is difficult to match the duration information in musical score, which will lead to the model instability or even failure to synthesize voice. To learn accurate alignment information automatically, this paper proposes an end-to-end SVS fr&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2202.07907v1-abstract-full').style.display = 'inline'; document.getElementById('2202.07907v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2202.07907v1-abstract-full" style="display: none;"> End-to-end singing voice synthesis (SVS) is attractive due to the avoidance of pre-aligned data. However, the auto learned alignment of singing voice with lyrics is difficult to match the duration information in musical score, which will lead to the model instability or even failure to synthesize voice. To learn accurate alignment information automatically, this paper proposes an end-to-end SVS framework, named Singing-Tacotron. The main difference between the proposed framework and Tacotron is that the speech can be controlled significantly by the musical score&#39;s duration information. Firstly, we propose a global duration control attention mechanism for the SVS model. The attention mechanism can control each phoneme&#39;s duration. Secondly, a duration encoder is proposed to learn a set of global transition tokens from the musical score. These transition tokens can help the attention mechanism decide whether moving to the next phoneme or staying at each decoding step. Thirdly, to further improve the model&#39;s stability, a dynamic filter is designed to help the model overcome noise interference and pay more attention to local context information. Subjective and objective evaluation verify the effectiveness of the method. Furthermore, the role of global transition tokens and the effect of duration control are explored. Examples of experiments can be found at https://hairuo55.github.io/SingingTacotron. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2202.07907v1-abstract-full').style.display = 'none'; document.getElementById('2202.07907v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 February, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages, 7 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2111.09801">arXiv:2111.09801</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2111.09801">pdf</a>, <a href="https://arxiv.org/format/2111.09801">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Theoretical Linear Convergence of Deep Unfolding Network for Block-Sparse Signal Recovery </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Fu%2C+R">Rong Fu</a>, <a href="/search/eess?searchtype=author&amp;query=Monardo%2C+V">Vincent Monardo</a>, <a href="/search/eess?searchtype=author&amp;query=Huang%2C+T">Tianyao Huang</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+Y">Yimin Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2111.09801v1-abstract-short" style="display: inline;"> In this paper, we consider the recovery of the high-dimensional block-sparse signal from a compressed set of measurements, where the non-zero coefficients of the recovered signal occur in a small number of blocks. Adopting the idea of deep unfolding, we explore the block-sparse structure and put forward a block-sparse reconstruction network named Ada-BlockLISTA, which performs gradient descent on&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2111.09801v1-abstract-full').style.display = 'inline'; document.getElementById('2111.09801v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2111.09801v1-abstract-full" style="display: none;"> In this paper, we consider the recovery of the high-dimensional block-sparse signal from a compressed set of measurements, where the non-zero coefficients of the recovered signal occur in a small number of blocks. Adopting the idea of deep unfolding, we explore the block-sparse structure and put forward a block-sparse reconstruction network named Ada-BlockLISTA, which performs gradient descent on every single block followed by a block-wise shrinkage. Furthermore, we prove the linear convergence rate of our proposed network, which also theoretically guarantees exact recovery for a potentially higher sparsity level based on underlyingblock structure. Numerical results indicate that Ada-BlockLISTA yields better signal recovery performance compared with existing algorithms, which ignore the additional block structure in the signal model. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2111.09801v1-abstract-full').style.display = 'none'; document.getElementById('2111.09801v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">12 pages, 7 figures, 35 conferences</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2111.07589">arXiv:2111.07589</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2111.07589">pdf</a>, <a href="https://arxiv.org/format/2111.07589">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Block-Sparse Recovery Network for Two-Dimensional Harmonic Retrieval </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Fu%2C+R">Rong Fu</a>, <a href="/search/eess?searchtype=author&amp;query=Huang%2C+T">Tianyao Huang</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+L">Lei Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+Y">Yimin Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2111.07589v1-abstract-short" style="display: inline;"> As a typical signal processing problem, multidimensional harmonic retrieval (MHR) has been adapted to a wide range of applications in signal processing. Block-sparse signals, whose nonzero entries appearing in clusters, have received much attention recently. An unfolded network, named Ada-BlockLISTA, was proposed to recover a block-sparse signal at a small computational cost, which learns an indiv&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2111.07589v1-abstract-full').style.display = 'inline'; document.getElementById('2111.07589v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2111.07589v1-abstract-full" style="display: none;"> As a typical signal processing problem, multidimensional harmonic retrieval (MHR) has been adapted to a wide range of applications in signal processing. Block-sparse signals, whose nonzero entries appearing in clusters, have received much attention recently. An unfolded network, named Ada-BlockLISTA, was proposed to recover a block-sparse signal at a small computational cost, which learns an individual weight matrix for each block. However, as the number of network parameters is increasingly associated with the number of blocks, the demand for parameter reduction becomes very significant, especially for large-scale MHR. Based on the dictionary characteristics in two-dimensional (2D) harmonic retrieve problems, we introduce a weight coupling structure to shrink Ada-BlockLISTA, which significantly reduces the number of weights without performance degradation. In simulations, our proposed block-sparse reconstruction network, named AdaBLISTA-CP, shows excellent recovery performance and convergence speed in 2D harmonic retrieval problems. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2111.07589v1-abstract-full').style.display = 'none'; document.getElementById('2111.07589v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 November, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">2 pages, 2 figures, 13 conferences</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2108.00334">arXiv:2108.00334</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2108.00334">pdf</a>, <a href="https://arxiv.org/format/2108.00334">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Statistical Mechanics">cond-mat.stat-mech</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1103/PhysRevE.104.044101">10.1103/PhysRevE.104.044101 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Energy harvesting from anisotropic fluctuations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Miangolarra%2C+O+M">Olga Movilla Miangolarra</a>, <a href="/search/eess?searchtype=author&amp;query=Taghvaei%2C+A">Amirhossein Taghvaei</a>, <a href="/search/eess?searchtype=author&amp;query=Fu%2C+R">Rui Fu</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+Y">Yongxin Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Georgiou%2C+T+T">Tryphon T. Georgiou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2108.00334v1-abstract-short" style="display: inline;"> We consider a rudimentary model for a heat engine, known as the Brownian gyrator, that consists of an overdamped system with two degrees of freedom in an anisotropic temperature field. Whereas the hallmark of the gyrator is a nonequilibrium steady-state curl-carrying probability current that can generate torque, we explore the coupling of this natural gyrating motion with a periodic actuation pote&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2108.00334v1-abstract-full').style.display = 'inline'; document.getElementById('2108.00334v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2108.00334v1-abstract-full" style="display: none;"> We consider a rudimentary model for a heat engine, known as the Brownian gyrator, that consists of an overdamped system with two degrees of freedom in an anisotropic temperature field. Whereas the hallmark of the gyrator is a nonequilibrium steady-state curl-carrying probability current that can generate torque, we explore the coupling of this natural gyrating motion with a periodic actuation potential for the purpose of extracting work. We show that path-lengths traversed in the manifold of thermodynamic states, measured in a suitable Riemannian metric, represent dissipative losses, while area integrals of a work-density quantify work being extracted. Thus, the maximal amount of work that can be extracted relates to an isoperimetric problem, trading off area against length of an encircling path. We derive an isoperimetric inequality that provides a universal bound on the efficiency of all cyclic operating protocols, and a bound on how fast a closed path can be traversed before it becomes impossible to extract positive work. The analysis presented provides guiding principles for building autonomous engines that extract work from anistropic fluctuations. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2108.00334v1-abstract-full').style.display = 'none'; document.getElementById('2108.00334v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 July, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages 5 figures</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Phys. Rev. E 104, 044101 (2021) </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2104.03617">arXiv:2104.03617</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2104.03617">pdf</a>, <a href="https://arxiv.org/format/2104.03617">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Half-Truth: A Partially Fake Audio Detection Dataset </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Yi%2C+J">Jiangyan Yi</a>, <a href="/search/eess?searchtype=author&amp;query=Bai%2C+Y">Ye Bai</a>, <a href="/search/eess?searchtype=author&amp;query=Tao%2C+J">Jianhua Tao</a>, <a href="/search/eess?searchtype=author&amp;query=Ma%2C+H">Haoxin Ma</a>, <a href="/search/eess?searchtype=author&amp;query=Tian%2C+Z">Zhengkun Tian</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+C">Chenglong Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+T">Tao Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Fu%2C+R">Ruibo Fu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2104.03617v2-abstract-short" style="display: inline;"> Diverse promising datasets have been designed to hold back the development of fake audio detection, such as ASVspoof databases. However, previous datasets ignore an attacking situation, in which the hacker hides some small fake clips in real speech audio. This poses a serious threat since that it is difficult to distinguish the small fake clip from the whole speech utterance. Therefore, this paper&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2104.03617v2-abstract-full').style.display = 'inline'; document.getElementById('2104.03617v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2104.03617v2-abstract-full" style="display: none;"> Diverse promising datasets have been designed to hold back the development of fake audio detection, such as ASVspoof databases. However, previous datasets ignore an attacking situation, in which the hacker hides some small fake clips in real speech audio. This poses a serious threat since that it is difficult to distinguish the small fake clip from the whole speech utterance. Therefore, this paper develops such a dataset for half-truth audio detection (HAD). Partially fake audio in the HAD dataset involves only changing a few words in an utterance.The audio of the words is generated with the very latest state-of-the-art speech synthesis technology. We can not only detect fake uttrances but also localize manipulated regions in a speech using this dataset. Some benchmark results are presented on this dataset. The results show that partially fake audio presents much more challenging than fully fake audio for fake audio detection. The HAD dataset is publicly available: https://zenodo.org/records/10377492. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2104.03617v2-abstract-full').style.display = 'none'; document.getElementById('2104.03617v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 8 April, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">accepted by Interspeech 2021</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2103.11736">arXiv:2103.11736</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2103.11736">pdf</a>, <a href="https://arxiv.org/format/2103.11736">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Automatic Pulmonary Artery-Vein Separation in CT Images using Twin-Pipe Network and Topology Reconstruction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Pan%2C+L">Lin Pan</a>, <a href="/search/eess?searchtype=author&amp;query=Zheng%2C+Y">Yaoyong Zheng</a>, <a href="/search/eess?searchtype=author&amp;query=Huang%2C+L">Liqin Huang</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+L">Liuqing Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+Z">Zhen Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Fu%2C+R">Rongda Fu</a>, <a href="/search/eess?searchtype=author&amp;query=Zheng%2C+B">Bin Zheng</a>, <a href="/search/eess?searchtype=author&amp;query=Zheng%2C+S">Shaohua Zheng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2103.11736v2-abstract-short" style="display: inline;"> With the development of medical computer-aided diagnostic systems, pulmonary artery-vein(A/V) separation plays a crucial role in assisting doctors in preoperative planning for lung cancer surgery. However, distinguishing arterial from venous irrigation in chest CT images remains a challenge due to the similarity and complex structure of the arteries and veins. We propose a novel method for automat&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2103.11736v2-abstract-full').style.display = 'inline'; document.getElementById('2103.11736v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2103.11736v2-abstract-full" style="display: none;"> With the development of medical computer-aided diagnostic systems, pulmonary artery-vein(A/V) separation plays a crucial role in assisting doctors in preoperative planning for lung cancer surgery. However, distinguishing arterial from venous irrigation in chest CT images remains a challenge due to the similarity and complex structure of the arteries and veins. We propose a novel method for automatic separation of pulmonary arteries and veins from chest CT images. The method consists of three parts. First, global connection information and local feature information are used to construct a complete topological tree and ensure the continuity of vessel reconstruction. Second, the Twin-Pipe network proposed can automatically learn the differences between arteries and veins at different levels to reduce classification errors caused by changes in terminal vessel characteristics. Finally, the topology optimizer considers interbranch and intrabranch topological relationships to maintain spatial consistency to avoid the misclassification of A/V irrigations. We validate the performance of the method on chest CT images. Compared with manual classification, the proposed method achieves an average accuracy of 96.2% on noncontrast chest CT. In addition, the method has been proven to have good generalization, that is, the accuracies of 93.8% and 94.8% are obtained for CT scans from other devices and other modes, respectively. The result of pulmonary artery-vein obtained by the proposed method can provide better assistance for preoperative planning of lung cancer surgery. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2103.11736v2-abstract-full').style.display = 'none'; document.getElementById('2103.11736v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 May, 2021; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 22 March, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2021. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2103.03986">arXiv:2103.03986</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2103.03986">pdf</a>, <a href="https://arxiv.org/ps/2103.03986">ps</a>, <a href="https://arxiv.org/format/2103.03986">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Statistical Mechanics">cond-mat.stat-mech</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Optimization and Control">math.OC</span> </div> </div> <p class="title is-5 mathjax"> On the relation between information and power in stochastic thermodynamic engines </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Taghvaei%2C+A">Amirhossein Taghvaei</a>, <a href="/search/eess?searchtype=author&amp;query=Miangolarra%2C+O+M">Olga Movilla Miangolarra</a>, <a href="/search/eess?searchtype=author&amp;query=Fu%2C+R">Rui Fu</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+Y">Yongxin Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Georgiou%2C+T+T">Tryphon T. Georgiou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2103.03986v1-abstract-short" style="display: inline;"> The common saying, that information is power, takes a rigorous form in stochastic thermodynamics, where a quantitative equivalence between the two helps explain the paradox of Maxwell&#39;s demon in its ability to reduce entropy. In the present paper, we build on earlier work on the interplay between the relative cost and benefits of information in producing work in cyclic operation of thermodynamic e&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2103.03986v1-abstract-full').style.display = 'inline'; document.getElementById('2103.03986v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2103.03986v1-abstract-full" style="display: none;"> The common saying, that information is power, takes a rigorous form in stochastic thermodynamics, where a quantitative equivalence between the two helps explain the paradox of Maxwell&#39;s demon in its ability to reduce entropy. In the present paper, we build on earlier work on the interplay between the relative cost and benefits of information in producing work in cyclic operation of thermodynamic engines (by Sandberg etal. 2014). Specifically, we study the general case of overdamped particles in a time-varying potential (control action) in feedback that utilizes continuous measurements (nonlinear filtering) of a thermodynamic ensemble, to produce suitable adaptations of the second law of thermodynamics that involve information. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2103.03986v1-abstract-full').style.display = 'none'; document.getElementById('2103.03986v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 March, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2021. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2102.12755">arXiv:2102.12755</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2102.12755">pdf</a>, <a href="https://arxiv.org/format/2102.12755">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Coarse-to-fine Airway Segmentation Using Multi information Fusion Network and CNN-based Region Growing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Guo%2C+J">Jinquan Guo</a>, <a href="/search/eess?searchtype=author&amp;query=Fu%2C+R">Rongda Fu</a>, <a href="/search/eess?searchtype=author&amp;query=Pan%2C+L">Lin Pan</a>, <a href="/search/eess?searchtype=author&amp;query=Zheng%2C+S">Shaohua Zheng</a>, <a href="/search/eess?searchtype=author&amp;query=Huang%2C+L">Liqin Huang</a>, <a href="/search/eess?searchtype=author&amp;query=Zheng%2C+B">Bin Zheng</a>, <a href="/search/eess?searchtype=author&amp;query=He%2C+B">Bingwei He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2102.12755v1-abstract-short" style="display: inline;"> Automatic airway segmentation from chest computed tomography (CT) scans plays an important role in pulmonary disease diagnosis and computer-assisted therapy. However, low contrast at peripheral branches and complex tree-like structures remain as two mainly challenges for airway segmentation. Recent research has illustrated that deep learning methods perform well in segmentation tasks. Motivated by&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2102.12755v1-abstract-full').style.display = 'inline'; document.getElementById('2102.12755v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2102.12755v1-abstract-full" style="display: none;"> Automatic airway segmentation from chest computed tomography (CT) scans plays an important role in pulmonary disease diagnosis and computer-assisted therapy. However, low contrast at peripheral branches and complex tree-like structures remain as two mainly challenges for airway segmentation. Recent research has illustrated that deep learning methods perform well in segmentation tasks. Motivated by these works, a coarse-to-fine segmentation framework is proposed to obtain a complete airway tree. Our framework segments the overall airway and small branches via the multi-information fusion convolution neural network (Mif-CNN) and the CNN-based region growing, respectively. In Mif-CNN, atrous spatial pyramid pooling (ASPP) is integrated into a u-shaped network, and it can expend the receptive field and capture multi-scale information. Meanwhile, boundary and location information are incorporated into semantic information. These information are fused to help Mif-CNN utilize additional context knowledge and useful features. To improve the performance of the segmentation result, the CNN-based region growing method is designed to focus on obtaining small branches. A voxel classification network (VCN), which can entirely capture the rich information around each voxel, is applied to classify the voxels into airway and non-airway. In addition, a shape reconstruction method is used to refine the airway tree. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2102.12755v1-abstract-full').style.display = 'none'; document.getElementById('2102.12755v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 February, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2021. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2102.11663">arXiv:2102.11663</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2102.11663">pdf</a>, <a href="https://arxiv.org/ps/2102.11663">ps</a>, <a href="https://arxiv.org/format/2102.11663">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/TSP.2021.3086593">10.1109/TSP.2021.3086593 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Structured LISTA for Multidimensional Harmonic Retrieval </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Fu%2C+R">Rong Fu</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+Y">Yimin Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Huang%2C+T">Tianyao Huang</a>, <a href="/search/eess?searchtype=author&amp;query=Eldar%2C+Y+C">Yonina C. Eldar</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2102.11663v1-abstract-short" style="display: inline;"> Learned iterative shrinkage thresholding algorithm (LISTA), which adopts deep learning techniques to learn optimal algorithm parameters from labeled training data, can be successfully applied to small-scale multidimensional harmonic retrieval (MHR) problems. However, LISTA computationally demanding for large-scale MHR problems because the matrix size of the learned mutual inhibition matrix exhibit&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2102.11663v1-abstract-full').style.display = 'inline'; document.getElementById('2102.11663v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2102.11663v1-abstract-full" style="display: none;"> Learned iterative shrinkage thresholding algorithm (LISTA), which adopts deep learning techniques to learn optimal algorithm parameters from labeled training data, can be successfully applied to small-scale multidimensional harmonic retrieval (MHR) problems. However, LISTA computationally demanding for large-scale MHR problems because the matrix size of the learned mutual inhibition matrix exhibits quadratic growth with the signal length. These large matrices consume costly memory/computation resources and require a huge amount of labeled data for training, restricting the applicability of the LISTA method. In this paper, we show that the mutual inhibition matrix of a MHR problem naturally has a Toeplitz structure, which means that the degrees of freedom (DoF) of the matrix can be reduced from a quadratic order to a linear order. By exploiting this characteristic, we propose a structured LISTA-Toeplitz network, which imposes a Toeplitz structure restriction on the mutual inhibition matrices and applies linear convolution instead of the matrix-vector multiplication involved in the traditional LISTA network. Both simulation and field test for air target detection with radar are carried out to validate the performance of the proposed network. For small-scale MHR problems, LISTAToeplitz exhibits close or even better recovery accuracy than traditional LISTA, while the former significantly reduces the network complexity and requires much less training data. For large-scale MHR problems, where LISTA is difficult to implement due to the huge size of the mutual inhibition matrices, our proposed LISTA-Toeplitz still enjoys desirable recovery performance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2102.11663v1-abstract-full').style.display = 'none'; document.getElementById('2102.11663v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 February, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">13 pages,13 figures, 50 references</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2101.02239">arXiv:2101.02239</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2101.02239">pdf</a>, <a href="https://arxiv.org/ps/2101.02239">ps</a>, <a href="https://arxiv.org/format/2101.02239">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Statistical Mechanics">cond-mat.stat-mech</span> </div> </div> <p class="title is-5 mathjax"> Harvesting energy from a periodic heat bath </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Fu%2C+R">Rui Fu</a>, <a href="/search/eess?searchtype=author&amp;query=Miangolarra%2C+O+M">Olga Movilla Miangolarra</a>, <a href="/search/eess?searchtype=author&amp;query=Taghvaei%2C+A">Amirhossein Taghvaei</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+Y">Yongxin Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Georgiou%2C+T+T">Tryphon T. Georgiou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2101.02239v1-abstract-short" style="display: inline;"> The context of the present paper is stochastic thermodynamics - an approach to nonequilibrium thermodynamics rooted within the broader framework of stochastic control. In contrast to the classical paradigm of Carnot engines, we herein propose to consider thermodynamic processes with periodic continuously varying temperature of a heat bath and study questions of maximal power and efficiency for two&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2101.02239v1-abstract-full').style.display = 'inline'; document.getElementById('2101.02239v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2101.02239v1-abstract-full" style="display: none;"> The context of the present paper is stochastic thermodynamics - an approach to nonequilibrium thermodynamics rooted within the broader framework of stochastic control. In contrast to the classical paradigm of Carnot engines, we herein propose to consider thermodynamic processes with periodic continuously varying temperature of a heat bath and study questions of maximal power and efficiency for two idealized cases, overdamped (first-order) and underdamped (second-order) stochastic models. We highlight properties of optimal periodic control, derive and numerically validate approximate formulae for the optimal performance (power and efficiency). <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2101.02239v1-abstract-full').style.display = 'none'; document.getElementById('2101.02239v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 January, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">9 pages, 2 figures. Conference</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2008.02312">arXiv:2008.02312</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2008.02312">pdf</a>, <a href="https://arxiv.org/format/2008.02312">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Axiom-based Grad-CAM: Towards Accurate Visualization and Explanation of CNNs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Fu%2C+R">Ruigang Fu</a>, <a href="/search/eess?searchtype=author&amp;query=Hu%2C+Q">Qingyong Hu</a>, <a href="/search/eess?searchtype=author&amp;query=Dong%2C+X">Xiaohu Dong</a>, <a href="/search/eess?searchtype=author&amp;query=Guo%2C+Y">Yulan Guo</a>, <a href="/search/eess?searchtype=author&amp;query=Gao%2C+Y">Yinghui Gao</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+B">Biao Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2008.02312v4-abstract-short" style="display: inline;"> To have a better understanding and usage of Convolution Neural Networks (CNNs), the visualization and interpretation of CNNs has attracted increasing attention in recent years. In particular, several Class Activation Mapping (CAM) methods have been proposed to discover the connection between CNN&#39;s decision and image regions. In spite of the reasonable visualization, lack of clear and sufficient th&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2008.02312v4-abstract-full').style.display = 'inline'; document.getElementById('2008.02312v4-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2008.02312v4-abstract-full" style="display: none;"> To have a better understanding and usage of Convolution Neural Networks (CNNs), the visualization and interpretation of CNNs has attracted increasing attention in recent years. In particular, several Class Activation Mapping (CAM) methods have been proposed to discover the connection between CNN&#39;s decision and image regions. In spite of the reasonable visualization, lack of clear and sufficient theoretical support is the main limitation of these methods. In this paper, we introduce two axioms -- Conservation and Sensitivity -- to the visualization paradigm of the CAM methods. Meanwhile, a dedicated Axiom-based Grad-CAM (XGrad-CAM) is proposed to satisfy these axioms as much as possible. Experiments demonstrate that XGrad-CAM is an enhanced version of Grad-CAM in terms of conservation and sensitivity. It is able to achieve better visualization performance than Grad-CAM, while also be class-discriminative and easy-to-implement compared with Grad-CAM++ and Ablation-CAM. The code is available at https://github.com/Fu0511/XGrad-CAM. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2008.02312v4-abstract-full').style.display = 'none'; document.getElementById('2008.02312v4-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 August, 2020; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 5 August, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2020. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">BMVC 2020 (Oral presentation). Code is avaliable at: https://github.com/Fu0511/XGrad-CAM</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2002.03214">arXiv:2002.03214</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2002.03214">pdf</a>, <a href="https://arxiv.org/format/2002.03214">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> DeepSIC: Deep Soft Interference Cancellation for Multiuser MIMO Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Shlezinger%2C+N">Nir Shlezinger</a>, <a href="/search/eess?searchtype=author&amp;query=Fu%2C+R">Rong Fu</a>, <a href="/search/eess?searchtype=author&amp;query=Eldar%2C+Y+C">Yonina C. Eldar</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2002.03214v2-abstract-short" style="display: inline;"> Digital receivers are required to recover the transmitted symbols from their observed channel output. In multiuser multiple-input multiple-output (MIMO) setups, where multiple symbols are simultaneously transmitted, accurate symbol detection is challenging. A family of algorithms capable of reliably recovering multiple symbols is based on interference cancellation. However, these methods assume th&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2002.03214v2-abstract-full').style.display = 'inline'; document.getElementById('2002.03214v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2002.03214v2-abstract-full" style="display: none;"> Digital receivers are required to recover the transmitted symbols from their observed channel output. In multiuser multiple-input multiple-output (MIMO) setups, where multiple symbols are simultaneously transmitted, accurate symbol detection is challenging. A family of algorithms capable of reliably recovering multiple symbols is based on interference cancellation. However, these methods assume that the channel is linear, a model which does not reflect many relevant channels, as well as require accurate channel state information (CSI), which may not be available. In this work we propose a multiuser MIMO receiver which learns to jointly detect in a data-driven fashion, without assuming a specific channel model or requiring CSI. In particular, we propose a data-driven implementation of the iterative soft interference cancellation (SIC) algorithm which we refer to as DeepSIC. The resulting symbol detector is based on integrating dedicated machine-learning (ML) methods into the iterative SIC algorithm. DeepSIC learns to carry out joint detection from a limited set of training samples without requiring the channel to be linear and its parameters to be known. Our numerical evaluations demonstrate that for linear channels with full CSI, DeepSIC approaches the performance of iterative SIC, which is comparable to the optimal performance, and outperforms previously proposed ML-based MIMO receivers. Furthermore, in the presence of CSI uncertainty, DeepSIC significantly outperforms model-based approaches. Finally, we show that DeepSIC accurately detects symbols in non-linear channels, where conventional iterative SIC fails even when accurate CSI is available. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2002.03214v2-abstract-full').style.display = 'none'; document.getElementById('2002.03214v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 June, 2020; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 8 February, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2020. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">arXiv admin note: text overlap with arXiv:2002.07806</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2001.00979">arXiv:2001.00979</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2001.00979">pdf</a>, <a href="https://arxiv.org/format/2001.00979">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Optimization and Control">math.OC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Mathematical Physics">math-ph</span> </div> </div> <p class="title is-5 mathjax"> Maximal power output of a stochastic thermodynamic engine </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Fu%2C+R">Rui Fu</a>, <a href="/search/eess?searchtype=author&amp;query=Taghvaei%2C+A">Amirhossein Taghvaei</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+Y">Yongxin Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Georgiou%2C+T+T">Tryphon T. Georgiou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2001.00979v2-abstract-short" style="display: inline;"> Classical thermodynamics aimed to quantify the efficiency of thermodynamic engines by bounding the maximal amount of mechanical energy produced compared to the amount of heat required. While this was accomplished early on, by Carnot and Clausius, the more practical problem to quantify limits of power that can be delivered, remained elusive due to the fact that quasistatic processes require infinit&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2001.00979v2-abstract-full').style.display = 'inline'; document.getElementById('2001.00979v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2001.00979v2-abstract-full" style="display: none;"> Classical thermodynamics aimed to quantify the efficiency of thermodynamic engines by bounding the maximal amount of mechanical energy produced compared to the amount of heat required. While this was accomplished early on, by Carnot and Clausius, the more practical problem to quantify limits of power that can be delivered, remained elusive due to the fact that quasistatic processes require infinitely slow cycling, resulting in a vanishing power output. Recent insights, drawn from stochastic models, appear to bridge the gap between theory and practice in that they lead to physically meaningful expressions for the dissipation cost in operating a thermodynamic engine over a finite time window. Building on this framework of {\em stochastic thermodynamics} we derive bounds on the maximal power that can be drawn by cycling an overdamped ensemble of particles via a time-varying potential while alternating contact with heat baths of different temperature ($T_c$ cold, and $T_h$ hot). Specifically, assuming a suitable bound $M$ on the spatial gradient of the controlling potential, we show that the maximal achievable power is bounded by $\frac{M}{8}(\frac{T_h}{T_c}-1)$. Moreover, we show that this bound can be reached to within a factor of $(\frac{T_h}{T_c}-1)/(\frac{T_h}{T_c}+1)$ by operating the cyclic thermodynamic process with a quadratic potential. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2001.00979v2-abstract-full').style.display = 'none'; document.getElementById('2001.00979v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 January, 2020; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 3 January, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2020. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">24 pages, 1 figure, 1 table</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 49-XX; 60-XX </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1904.11419">arXiv:1904.11419</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1904.11419">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Time Series Simulation by Conditional Generative Adversarial Net </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Fu%2C+R">Rao Fu</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+J">Jie Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Zeng%2C+S">Shutian Zeng</a>, <a href="/search/eess?searchtype=author&amp;query=Zhuang%2C+Y">Yiping Zhuang</a>, <a href="/search/eess?searchtype=author&amp;query=Sudjianto%2C+A">Agus Sudjianto</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1904.11419v1-abstract-short" style="display: inline;"> Generative Adversarial Net (GAN) has been proven to be a powerful machine learning tool in image data analysis and generation. In this paper, we propose to use Conditional Generative Adversarial Net (CGAN) to learn and simulate time series data. The conditions can be both categorical and continuous variables containing different kinds of auxiliary information. Our simulation studies show that CGAN&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1904.11419v1-abstract-full').style.display = 'inline'; document.getElementById('1904.11419v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1904.11419v1-abstract-full" style="display: none;"> Generative Adversarial Net (GAN) has been proven to be a powerful machine learning tool in image data analysis and generation. In this paper, we propose to use Conditional Generative Adversarial Net (CGAN) to learn and simulate time series data. The conditions can be both categorical and continuous variables containing different kinds of auxiliary information. Our simulation studies show that CGAN is able to learn different kinds of normal and heavy tail distributions, as well as dependent structures of different time series and it can further generate conditional predictive distributions consistent with the training data distributions. We also provide an in-depth discussion on the rationale of GAN and the neural network as hierarchical splines to draw a clear connection with the existing statistical method for distribution generation. In practice, CGAN has a wide range of applications in the market risk and counterparty risk analysis: it can be applied to learn the historical data and generate scenarios for the calculation of Value-at-Risk (VaR) and Expected Shortfall (ES) and predict the movement of the market risk factors. We present a real data analysis including a backtesting to demonstrate CGAN is able to outperform the Historic Simulation, a popular method in market risk analysis for the calculation of VaR. CGAN can also be applied in the economic time series modeling and forecasting, and an example of hypothetical shock analysis for economic models and the generation of potential CCAR scenarios by CGAN is given at the end of the paper. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1904.11419v1-abstract-full').style.display = 'none'; document.getElementById('1904.11419v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 April, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2019. </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Fu%2C+R&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Fu%2C+R&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Fu%2C+R&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> </ul> </nav> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>

Pages: 1 2 3 4 5 6 7 8 9 10