
Search | arXiv e-print repository

Showing 1–39 of 39 results for author: Deng, C

Searching in archive eess. (To search all archives: https://arxiv.org/search/?searchtype=author&query=Deng%2C+C)
1. arXiv:2502.06156 [pdf, ps, other]
   Subjects: High Energy Physics - Phenomenology (hep-ph); Systems and Control (eess.SY)
   Title: Axial current as the origin of quantum intrinsic orbital angular momentum
   Authors: Orkash Amat, Nurimangul Nurmamat, Yong-Feng Huang, Cheng-Ming Li, Jin-Jun Geng, Chen-Ran Hu, Ze-Cheng Zou, Xiao-Fei Dong, Chen Deng, Fan Xu, Xiao-li Zhang, Chen Du
   Abstract: We show that it is impossible to experimentally observe the quantum intrinsic orbital angular momentum (IOAM) effect without its axial current. Broadly speaking, we argue that the spiral or interference characteristics of the axial current density determine the occurrence of nonlinear or tunneling effects in any spacetime-dependent quantum system. Our findings offer a comprehensive theoretical framework that addresses the limitations of Keldysh theory and provides new insights into the angular momentum properties of quantum systems, particularly in tunneling-dominated regimes. Using Wigner function methods, a fermionic generalized two-level model, and Berry phase simulations, we predict that the IOAM effect can persist even in pure quantum tunneling processes. These results open the door for experimental verification of IOAM effects in future high-intensity QED experiments, such as those using X-ray free electron lasers.
   Submitted 10 February, 2025; originally announced February 2025.
   Comments: 5 pages, 2 figures

2. arXiv:2501.06282 [pdf, other]
   Subjects: Computation and Language (cs.CL); Artificial Intelligence (cs.AI); Human-Computer Interaction (cs.HC); Sound (cs.SD); Audio and Speech Processing (eess.AS)
   Title: MinMo: A Multimodal Large Language Model for Seamless Voice Interaction
   Authors: Qian Chen, Yafeng Chen, Yanni Chen, Mengzhe Chen, Yingda Chen, Chong Deng, Zhihao Du, Ruize Gao, Changfeng Gao, Zhifu Gao, Yabin Li, Xiang Lv, Jiaqing Liu, Haoneng Luo, Bin Ma, Chongjia Ni, Xian Shi, Jialong Tang, Hui Wang, Hao Wang, Wen Wang, Yuxuan Wang, Yunlan Xu, Fan Yu, Zhijie Yan, et al. (11 additional authors not shown)
   Abstract: Recent advancements in large language models (LLMs) and multimodal speech-text models have laid the groundwork for seamless voice interactions, enabling real-time, natural, and human-like conversations. Previous models for voice interactions are categorized as native and aligned. Native models integrate speech and text processing in one framework but struggle with issues like differing sequence lengths and insufficient pre-training. Aligned models maintain text LLM capabilities but are often limited by small datasets and a narrow focus on speech tasks. In this work, we introduce MinMo, a Multimodal Large Language Model with approximately 8B parameters for seamless voice interaction. We address the main limitations of prior aligned multimodal models. We train MinMo through multiple stages of speech-to-text alignment, text-to-speech alignment, speech-to-speech alignment, and duplex interaction alignment, on 1.4 million hours of diverse speech data and a broad range of speech tasks. After the multi-stage training, MinMo achieves state-of-the-art performance across various benchmarks for voice comprehension and generation while maintaining the capabilities of text LLMs, and also facilitates full-duplex conversation, that is, simultaneous two-way communication between the user and the system. Moreover, we propose a novel and simple voice decoder that outperforms prior models in voice generation. The enhanced instruction-following capabilities of MinMo support controlling speech generation based on user instructions, with various nuances including emotions, dialects, and speaking rates, as well as mimicking specific voices. For MinMo, the speech-to-text latency is approximately 100ms, and the full-duplex latency is approximately 600ms in theory and 800ms in practice. The MinMo project web page is https://funaudiollm.github.io/minmo, and the code and models will be released soon.
   Submitted 10 January, 2025; originally announced January 2025.
   Comments: Work in progress. Authors are listed in alphabetical order by family name.

3. arXiv:2412.10117 [pdf, other]
   Subjects: Sound (cs.SD); Artificial Intelligence (cs.AI); Machine Learning (cs.LG); Audio and Speech Processing (eess.AS)
   Title: CosyVoice 2: Scalable Streaming Speech Synthesis with Large Language Models
   Authors: Zhihao Du, Yuxuan Wang, Qian Chen, Xian Shi, Xiang Lv, Tianyu Zhao, Zhifu Gao, Yexin Yang, Changfeng Gao, Hui Wang, Fan Yu, Huadai Liu, Zhengyan Sheng, Yue Gu, Chong Deng, Wen Wang, Shiliang Zhang, Zhijie Yan, Jingren Zhou
   Abstract: In our previous work, we introduced CosyVoice, a multilingual speech synthesis model based on supervised discrete speech tokens. By employing progressive semantic decoding with two popular generative models, language models (LMs) and Flow Matching, CosyVoice demonstrated high prosody naturalness, content consistency, and speaker similarity in speech in-context learning. Recently, significant progress has been made in multi-modal large language models (LLMs), where the response latency and real-time factor of speech synthesis play a crucial role in the interactive experience. Therefore, in this report, we present an improved streaming speech synthesis model, CosyVoice 2, which incorporates comprehensive and systematic optimizations. Specifically, we introduce finite-scalar quantization to improve the codebook utilization of speech tokens. For the text-speech LM, we streamline the model architecture to allow direct use of a pre-trained LLM as the backbone. In addition, we develop a chunk-aware causal flow matching model to support various synthesis scenarios, enabling both streaming and non-streaming synthesis within a single model. By training on a large-scale multilingual dataset, CosyVoice 2 achieves human-parity naturalness, minimal response latency, and virtually lossless synthesis quality in the streaming mode. We invite readers to listen to the demos at https://funaudiollm.github.io/cosyvoice2.
   Submitted 25 December, 2024; v1 submitted 13 December, 2024; originally announced December 2024.
   Comments: Tech report, work in progress
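   Note: the finite-scalar quantization (FSQ) mentioned in the CosyVoice 2 abstract is not detailed in this listing. As a rough, generic sketch of FSQ itself (not CosyVoice 2's actual tokenizer; the level counts and tensor shapes below are illustrative assumptions), each channel of a low-dimensional projection is bounded, rounded to a few discrete levels, and trained with a straight-through gradient:

```python
import torch

def fsq(z: torch.Tensor, levels=(8, 8, 8, 5, 5)) -> torch.Tensor:
    """Finite-scalar quantization sketch: bound each channel, round it to a
    small number of levels, and pass gradients straight through.
    `z` has shape (..., len(levels)); the level counts are illustrative."""
    num_levels = torch.tensor(levels, dtype=z.dtype, device=z.device)
    half = (num_levels - 1) / 2
    bounded = torch.tanh(z) * half       # squash each channel into its range
    quantized = torch.round(bounded)     # snap to the nearest allowed level
    # Straight-through estimator: the forward pass uses the rounded values,
    # the backward pass treats the rounding as the identity.
    return bounded + (quantized - bounded).detach()

codes = fsq(torch.randn(2, 10, 5))       # e.g. (batch, tokens, channels)
```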
4. arXiv:2410.17799 [pdf, other]
   Subjects: Computation and Language (cs.CL); Artificial Intelligence (cs.AI); Sound (cs.SD); Audio and Speech Processing (eess.AS)
   Title: OmniFlatten: An End-to-end GPT Model for Seamless Voice Conversation
   Authors: Qinglin Zhang, Luyao Cheng, Chong Deng, Qian Chen, Wen Wang, Siqi Zheng, Jiaqing Liu, Hai Yu, Chaohong Tan, Zhihao Du, Shiliang Zhang
   Abstract: Full-duplex spoken dialogue systems significantly surpass traditional turn-based dialogue systems, as they allow simultaneous bidirectional communication, closely mirroring human-human interactions. However, achieving low latency and natural interactions in full-duplex dialogue systems remains a significant challenge, especially considering human conversation dynamics such as interruptions, backchannels, and overlapping speech. In this paper, we introduce a novel end-to-end GPT-based model, OmniFlatten, for full-duplex conversation, capable of effectively modeling the complex behaviors inherent to natural conversations with low latency. To achieve full-duplex conversation capabilities, we propose a multi-stage post-training scheme that progressively adapts a text large language model (LLM) backbone into a speech-text dialogue LLM capable of generating text and speech in real time, without modifying the architecture of the backbone LLM. The training process comprises three stages: modality alignment, half-duplex dialogue learning, and full-duplex dialogue learning. In all training stages, we standardize the data using a flattening operation, which unifies the training methods and the GPT backbone across different modalities and tasks. Our approach offers a simple modeling technique and a promising research direction for developing efficient and natural end-to-end full-duplex spoken dialogue systems. Audio samples of dialogues generated by OmniFlatten can be found at https://omniflatten.github.io/.
   Submitted 3 January, 2025; v1 submitted 23 October, 2024; originally announced October 2024.
   Comments: Work in progress
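   Note: OmniFlatten's "flattening operation" is only named in the abstract above. Purely to illustrate the general idea of serializing aligned text and speech token streams into a single sequence for a decoder-only backbone (the chunk sizes and token values below are invented placeholders, not the paper's format), a minimal sketch could be:

```python
from typing import List

def flatten_streams(text_tokens: List[int], speech_tokens: List[int],
                    text_chunk: int = 2, speech_chunk: int = 6) -> List[int]:
    """Interleave fixed-size chunks of two token streams into one flat
    sequence that a decoder-only LM can model autoregressively."""
    flat, t, s = [], 0, 0
    while t < len(text_tokens) or s < len(speech_tokens):
        flat.extend(text_tokens[t:t + text_chunk])
        t += text_chunk
        flat.extend(speech_tokens[s:s + speech_chunk])
        s += speech_chunk
    return flat

# Toy example: 4 "text" tokens interleaved with 12 "speech" tokens.
print(flatten_streams(list(range(4)), list(range(100, 112))))
```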
5. arXiv:2409.13292 [pdf, other]
   Subjects: Audio and Speech Processing (eess.AS); Sound (cs.SD)
   Title: Exploring Text-Queried Sound Event Detection with Audio Source Separation
   Authors: Han Yin, Jisheng Bai, Yang Xiao, Hui Wang, Siqi Zheng, Yafeng Chen, Rohan Kumar Das, Chong Deng, Jianfeng Chen
   Abstract: In sound event detection (SED), overlapping sound events pose a significant challenge, as certain events can be easily masked by background noise or other events, resulting in poor detection performance. To address this issue, we propose the text-queried SED (TQ-SED) framework. Specifically, we first pre-train a language-queried audio source separation (LASS) model to separate the audio tracks corresponding to different events from the input audio. Then, multiple target SED branches are employed to detect individual events. AudioSep is a state-of-the-art LASS model, but it has limitations in extracting dynamic audio information because of its purely convolutional separation structure. To address this, we integrate a dual-path recurrent neural network block into the model. We refer to this structure as AudioSep-DP, which achieved first place in DCASE 2024 Task 9 on language-queried audio source separation (objective single-model track). Experimental results show that TQ-SED can significantly improve SED performance, with an improvement of 7.22% in F1 score over the conventional framework. Additionally, we set up comprehensive experiments to explore the impact of model complexity. The source code and pre-trained model are released at https://github.com/apple-yinhan/TQ-SED.
   Submitted 10 January, 2025; v1 submitted 20 September, 2024; originally announced September 2024.
   Comments: Accepted by ICASSP 2025
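   Note: the TQ-SED framework above is described as a language-queried separator front-end feeding one detection branch per event. A minimal structural sketch, with dummy stand-ins for the actual LASS separator (AudioSep-DP) and the SED branches, might look like the following; all module internals here are assumptions for illustration:

```python
import torch
import torch.nn as nn
from typing import List

class TQSED(nn.Module):
    """Sketch of the TQ-SED layout: one separation pass per text query,
    then one sound-event-detection branch per event class."""
    def __init__(self, separator: nn.Module, sed_branches: nn.ModuleList):
        super().__init__()
        self.separator = separator        # stand-in for a LASS model
        self.sed_branches = sed_branches  # one detector per event class

    def forward(self, mixture: torch.Tensor, queries: List[str]) -> torch.Tensor:
        frame_probs = []
        for query, branch in zip(queries, self.sed_branches):
            track = self.separator(mixture, query)  # isolate the queried event
            frame_probs.append(branch(track))       # frame-level activity probs
        return torch.stack(frame_probs, dim=1)      # (batch, events, frames)

class _DummySeparator(nn.Module):
    def forward(self, mixture, query):
        return mixture  # a real LASS model would condition on the text query

class _DummyBranch(nn.Module):
    def __init__(self, feat_dim: int = 64):
        super().__init__()
        self.proj = nn.Linear(feat_dim, 1)
    def forward(self, track):                       # track: (batch, frames, feat)
        return torch.sigmoid(self.proj(track)).squeeze(-1)

model = TQSED(_DummySeparator(), nn.ModuleList([_DummyBranch() for _ in range(3)]))
out = model(torch.randn(2, 100, 64), ["dog bark", "siren", "speech"])  # (2, 3, 100)
```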
6. arXiv:2408.00365 [pdf, other]
   Subjects: Artificial Intelligence (cs.AI); Computer Vision and Pattern Recognition (cs.CV); Image and Video Processing (eess.IV)
   Title: Multimodal Fusion and Coherence Modeling for Video Topic Segmentation
   Authors: Hai Yu, Chong Deng, Qinglin Zhang, Jiaqing Liu, Qian Chen, Wen Wang
   Abstract: The video topic segmentation (VTS) task segments videos into intelligible, non-overlapping topics, facilitating efficient comprehension of video content and quick access to specific content. VTS is also critical to various downstream video understanding tasks. Traditional VTS methods using shallow features or unsupervised approaches struggle to accurately discern the nuances of topical transitions. Recently, supervised approaches have achieved superior performance on video action or scene segmentation over unsupervised approaches. In this work, we improve supervised VTS by thoroughly exploring multimodal fusion and multimodal coherence modeling. Specifically, (1) we enhance multimodal fusion by exploring different architectures using cross-attention and mixture of experts; (2) to generally strengthen multimodality alignment and fusion, we pre-train and fine-tune the model with multimodal contrastive learning; (3) we propose a new pre-training task tailored for the VTS task, and a novel fine-tuning task for enhancing multimodal coherence modeling for VTS. We evaluate the proposed approaches on educational videos, in the form of lectures, due to the vital role of topic segmentation of educational videos in boosting learning experiences. Additionally, we introduce a large-scale Chinese lecture video dataset to augment the existing English corpus, promoting further research in VTS. Experiments on both English and Chinese lecture datasets demonstrate that our model achieves superior VTS performance compared to competitive unsupervised and supervised baselines.
   Submitted 29 December, 2024; v1 submitted 1 August, 2024; originally announced August 2024.
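   Note: the VTS abstract above mentions cross-attention-based multimodal fusion without architectural detail. As a generic illustration of that building block only (the dimensions and the residual-plus-norm layout are assumptions, not the paper's design), text features can attend to visual frame features like so:

```python
import torch
import torch.nn as nn

class CrossModalFusion(nn.Module):
    """Generic cross-attention fusion: text features attend to visual features."""
    def __init__(self, dim: int = 256, heads: int = 4):
        super().__init__()
        self.attn = nn.MultiheadAttention(dim, heads, batch_first=True)
        self.norm = nn.LayerNorm(dim)

    def forward(self, text_feats: torch.Tensor, visual_feats: torch.Tensor) -> torch.Tensor:
        # text_feats: (batch, n_sentences, dim); visual_feats: (batch, n_frames, dim)
        fused, _ = self.attn(query=text_feats, key=visual_feats, value=visual_feats)
        return self.norm(text_feats + fused)   # residual connection + layer norm

fused = CrossModalFusion()(torch.randn(2, 12, 256), torch.randn(2, 48, 256))
```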
7. arXiv:2407.04051 [pdf, other]
   Subjects: Sound (cs.SD); Artificial Intelligence (cs.AI); Audio and Speech Processing (eess.AS)
   Title: FunAudioLLM: Voice Understanding and Generation Foundation Models for Natural Interaction Between Humans and LLMs
   Authors: Keyu An, Qian Chen, Chong Deng, Zhihao Du, Changfeng Gao, Zhifu Gao, Yue Gu, Ting He, Hangrui Hu, Kai Hu, Shengpeng Ji, Yabin Li, Zerui Li, Heng Lu, Haoneng Luo, Xiang Lv, Bin Ma, Ziyang Ma, Chongjia Ni, Changhe Song, Jiaqi Shi, Xian Shi, Hao Wang, Wen Wang, Yuxuan Wang, et al. (8 additional authors not shown)
   Abstract: This report introduces FunAudioLLM, a model family designed to enhance natural voice interactions between humans and large language models (LLMs). At its core are two innovative models: SenseVoice, which handles multilingual speech recognition, emotion recognition, and audio event detection; and CosyVoice, which facilitates natural speech generation with control over multiple languages, timbre, speaking style, and speaker identity. SenseVoice-Small delivers exceptionally low-latency ASR for 5 languages, and SenseVoice-Large supports high-precision ASR for over 50 languages, while CosyVoice excels in multilingual voice generation, zero-shot in-context learning, cross-lingual voice cloning, and instruction-following capabilities. The models related to SenseVoice and CosyVoice have been open-sourced on ModelScope and Hugging Face, along with the corresponding training, inference, and fine-tuning code released on GitHub. By integrating these models with LLMs, FunAudioLLM enables applications such as speech-to-speech translation, emotional voice chat, interactive podcasts, and expressive audiobook narration, thereby pushing the boundaries of voice interaction technology. Demos are available at https://fun-audio-llm.github.io, and the code can be accessed at https://github.com/FunAudioLLM.
   Submitted 10 July, 2024; v1 submitted 4 July, 2024; originally announced July 2024.
   Comments: Work in progress. Authors are listed in alphabetical order by family name.

8. arXiv:2406.09444 [pdf, other]
   Subjects: Audio and Speech Processing (eess.AS); Computation and Language (cs.CL); Sound (cs.SD)
   Title: GenDistiller: Distilling Pre-trained Language Models based on an Autoregressive Generative Model
   Authors: Yingying Gao, Shilei Zhang, Chao Deng, Junlan Feng
   Abstract: Pre-trained speech language models such as HuBERT and WavLM leverage unlabeled speech data for self-supervised learning and offer powerful representations for numerous downstream tasks. Despite the success of these models, their high memory and compute requirements hinder their application on resource-restricted devices. Therefore, this paper introduces GenDistiller, a novel knowledge distillation framework which generates the hidden representations of the pre-trained teacher model directly with a much smaller student network. The proposed method takes the previous hidden layer as history and predicts the teacher model's layers autoregressively, layer by layer. Experiments on SUPERB reveal the advantage of GenDistiller over the baseline distillation method without an autoregressive framework, with 33% fewer parameters, similar time consumption, and better performance on most SUPERB tasks. Ultimately, the proposed GenDistiller reduces the size of WavLM by 82%.
   Submitted 21 June, 2024; v1 submitted 11 June, 2024; originally announced June 2024.
   Comments: arXiv admin note: text overlap with arXiv:2310.13418
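   Note: GenDistiller's layer-by-layer autoregressive prediction of teacher hidden states is described only at a high level above. A minimal sketch of that training idea, with an invented GRU predictor and L1 loss standing in for the student network and objective actually used, could be:

```python
import torch
import torch.nn as nn
from typing import List

class LayerwisePredictor(nn.Module):
    """Sketch of a student that predicts teacher hidden layer t+1 from layer t."""
    def __init__(self, dim: int = 768):
        super().__init__()
        self.step = nn.GRU(dim, dim, batch_first=True)  # stand-in generator

    def forward(self, prev_layer: torch.Tensor) -> torch.Tensor:
        out, _ = self.step(prev_layer)                  # (batch, frames, dim)
        return out

def distill_loss(student: LayerwisePredictor,
                 teacher_layers: List[torch.Tensor]) -> torch.Tensor:
    """L1 distance between predicted and true teacher layers, layer by layer.
    `teacher_layers` holds (batch, frames, dim) hidden states from the teacher."""
    loss, history = 0.0, teacher_layers[0]
    for target in teacher_layers[1:]:
        loss = loss + nn.functional.l1_loss(student(history), target)
        history = target                                # teacher forcing
    return loss / (len(teacher_layers) - 1)

layers = [torch.randn(2, 50, 768) for _ in range(4)]    # toy "teacher" states
print(distill_loss(LayerwisePredictor(), layers))
```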
9. arXiv:2406.07801 [pdf, other]
   Subjects: Computation and Language (cs.CL); Sound (cs.SD); Audio and Speech Processing (eess.AS)
   Title: PolySpeech: Exploring Unified Multitask Speech Models for Competitiveness with Single-task Models
   Authors: Runyan Yang, Huibao Yang, Xiqing Zhang, Tiantian Ye, Ying Liu, Yingying Gao, Shilei Zhang, Chao Deng, Junlan Feng
   Abstract: Recently, there have been attempts to integrate various speech processing tasks into a unified model. However, few previous works have directly demonstrated that joint optimization of diverse tasks in multitask speech models has a positive influence on the performance of individual tasks. In this paper, we present a multitask speech model, PolySpeech, which supports speech recognition, speech synthesis, and two speech classification tasks. PolySpeech takes a multi-modal language model as its core structure and uses semantic representations as speech inputs. We introduce semantic speech embedding tokenization and speech reconstruction methods to PolySpeech, enabling efficient generation of high-quality speech for any given speaker. PolySpeech shows competitiveness across various tasks compared to single-task models. In our experiments, multitask optimization achieves performance comparable to single-task optimization and is especially beneficial for specific tasks.
   Submitted 11 June, 2024; originally announced June 2024.
   Comments: 5 pages, 2 figures

10. arXiv:2405.10463 [pdf, other]
   Subjects: Optics (physics.optics); Image and Video Processing (eess.IV); Biological Physics (physics.bio-ph)
   Title: Single-shot volumetric fluorescence imaging with neural fields
   Authors: Oumeng Zhang, Haowen Zhou, Brandon Y. Feng, Elin M. Larsson, Reinaldo E. Alcalde, Siyuan Yin, Catherine Deng, Changhuei Yang
   Abstract: Single-shot volumetric fluorescence (SVF) imaging offers a significant advantage over traditional imaging methods that require scanning across multiple axial planes, as it can capture biological processes with high temporal resolution. The key challenges in SVF imaging include requiring sparsity constraints, eliminating depth ambiguity in the reconstruction, and maintaining high resolution across a large field of view. In this paper, we introduce the QuadraPol point spread function (PSF) combined with neural fields, a novel approach for SVF imaging. This method utilizes a custom polarizer at the back focal plane and a polarization camera to detect fluorescence, effectively encoding the 3D scene within a compact PSF without depth ambiguity. Additionally, we propose a reconstruction algorithm based on the neural fields technique that provides improved reconstruction quality compared to classical deconvolution methods. The QuadraPol PSF, combined with neural fields, significantly reduces the acquisition time of a conventional fluorescence microscope by approximately 20 times and captures a 100 mm$^3$ volume in one shot. We validate the effectiveness of both our hardware and algorithm through all-in-focus imaging of bacterial colonies on sand surfaces and visualization of plant root morphology. Our approach offers a powerful tool for advancing biological research and ecological studies.
   Submitted 21 January, 2025; v1 submitted 16 May, 2024; originally announced May 2024.
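   Note: the "neural fields" reconstruction mentioned above refers to the general coordinate-network technique. As an illustration of that general idea only (not the paper's QuadraPol pipeline; the network width and activation choices are assumptions), a fluorescent volume can be represented as an MLP from 3D coordinates to intensity, to be fitted through a differentiable model of the imaging system:

```python
import torch
import torch.nn as nn

class FluorescenceField(nn.Module):
    """Neural-field sketch: map (x, y, z) coordinates to emitter intensity."""
    def __init__(self, hidden: int = 128):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(3, hidden), nn.ReLU(),
            nn.Linear(hidden, hidden), nn.ReLU(),
            nn.Linear(hidden, 1), nn.Softplus(),   # intensities are non-negative
        )

    def forward(self, xyz: torch.Tensor) -> torch.Tensor:
        return self.net(xyz).squeeze(-1)

# Query the field at random points in a normalized volume; in practice the field
# would be optimized so that a differentiable model of the optics (PSF + camera)
# reproduces the captured image.
field = FluorescenceField()
intensity = field(torch.rand(1024, 3))
```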
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.19971">arXiv:2403.19971</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2403.19971">pdf</a>, <a href="https://arxiv.org/format/2403.19971">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> 3D-Speaker-Toolkit: An Open-Source Toolkit for Multimodal Speaker Verification and Diarization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Chen%2C+Y">Yafeng Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Zheng%2C+S">Siqi Zheng</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+H">Hui Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Cheng%2C+L">Luyao Cheng</a>, <a href="/search/eess?searchtype=author&amp;query=Zhu%2C+T">Tinglong Zhu</a>, <a href="/search/eess?searchtype=author&amp;query=Huang%2C+R">Rongjie Huang</a>, <a href="/search/eess?searchtype=author&amp;query=Deng%2C+C">Chong Deng</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+Q">Qian Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+S">Shiliang Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+W">Wen Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+X">Xihao Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.19971v3-abstract-short" style="display: inline;"> We introduce 3D-Speaker-Toolkit, an open-source toolkit for multimodal speaker verification and diarization, designed for meeting the needs of academic researchers and industrial practitioners. The 3D-Speaker-Toolkit adeptly leverages the combined strengths of acoustic, semantic, and visual data, seamlessly fusing these modalities to offer robust speaker recognition capabilities. The acoustic modu&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.19971v3-abstract-full').style.display = 'inline'; document.getElementById('2403.19971v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.19971v3-abstract-full" style="display: none;"> We introduce 3D-Speaker-Toolkit, an open-source toolkit for multimodal speaker verification and diarization, designed for meeting the needs of academic researchers and industrial practitioners. The 3D-Speaker-Toolkit adeptly leverages the combined strengths of acoustic, semantic, and visual data, seamlessly fusing these modalities to offer robust speaker recognition capabilities. The acoustic module extracts speaker embeddings from acoustic features, employing both fully-supervised and self-supervised learning approaches. The semantic module leverages advanced language models to comprehend the substance and context of spoken language, thereby augmenting the system&#39;s proficiency in distinguishing speakers through linguistic patterns. The visual module applies image processing technologies to scrutinize facial features, which bolsters the precision of speaker diarization in multi-speaker environments. 
Collectively, these modules empower the 3D-Speaker-Toolkit to achieve substantially improved accuracy and reliability in speaker-related tasks. With 3D-Speaker-Toolkit, we establish a new benchmark for multimodal speaker analysis. The toolkit also includes a handful of open-source state-of-the-art models and a large-scale dataset containing over 10,000 speakers. The toolkit is publicly available at https://github.com/modelscope/3D-Speaker. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.19971v3-abstract-full').style.display = 'none'; document.getElementById('2403.19971v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 29 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2402.12746">arXiv:2402.12746</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2402.12746">pdf</a>, <a href="https://arxiv.org/ps/2402.12746">ps</a>, <a href="https://arxiv.org/format/2402.12746">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Plugin Speech Enhancement: A Universal Speech Enhancement Framework Inspired by Dynamic Neural Network </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Chen%2C+Y">Yanan Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Cui%2C+Z">Zihao Cui</a>, <a href="/search/eess?searchtype=author&amp;query=Gao%2C+Y">Yingying Gao</a>, <a href="/search/eess?searchtype=author&amp;query=Feng%2C+J">Junlan Feng</a>, <a href="/search/eess?searchtype=author&amp;query=Deng%2C+C">Chao Deng</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+S">Shilei Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2402.12746v1-abstract-short" style="display: inline;"> The expectation to deploy a universal neural network for speech enhancement, with the aim of improving noise robustness across diverse speech processing tasks, faces challenges due to the existing lack of awareness within static speech enhancement frameworks regarding the expected speech in downstream modules. 
These limitations impede the effectiveness of static speech enhancement approaches in ac&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.12746v1-abstract-full').style.display = 'inline'; document.getElementById('2402.12746v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2402.12746v1-abstract-full" style="display: none;"> The expectation to deploy a universal neural network for speech enhancement, with the aim of improving noise robustness across diverse speech processing tasks, faces challenges due to the existing lack of awareness within static speech enhancement frameworks regarding the expected speech in downstream modules. These limitations impede the effectiveness of static speech enhancement approaches in achieving optimal performance for a range of speech processing tasks, thereby challenging the notion of universal applicability. The fundamental issue in achieving universal speech enhancement lies in effectively informing the speech enhancement module about the features of downstream modules. In this study, we present a novel weighting prediction approach, which explicitly learns the task relationships from downstream training information to address the core challenge of universal speech enhancement. We found the role of deciding whether to employ data augmentation techniques as crucial downstream training information. This decision significantly impacts the expected speech and the performance of the speech enhancement module. Moreover, we introduce a novel speech enhancement network, the Plugin Speech Enhancement (Plugin-SE). The Plugin-SE is a dynamic neural network that includes the speech enhancement module, gate module, and weight prediction module. Experimental results demonstrate that the proposed Plugin-SE approach is competitive or superior to other joint training methods across various downstream tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.12746v1-abstract-full').style.display = 'none'; document.getElementById('2402.12746v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2024. 
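<p class="is-size-7">Editor's note: a minimal sketch of the dynamic-combination idea described for Plugin-SE (arXiv:2402.12746) above, assuming a gate that blends enhanced features with the noisy input conditioned on a downstream-task descriptor. The module names, dimensions, and the GRU enhancer are illustrative assumptions, not the authors' architecture.</p>
<pre><code># Illustrative sketch (not the authors' code): a speech-enhancement front-end
# whose output is softly gated against the noisy input, with the gate driven
# by a hypothetical downstream-task descriptor.
import torch
import torch.nn as nn

class PluginSESketch(nn.Module):
    def __init__(self, feat_dim=80, task_dim=8):
        super().__init__()
        self.enhancer = nn.GRU(feat_dim, feat_dim, batch_first=True)
        self.gate = nn.Sequential(nn.Linear(task_dim, 1), nn.Sigmoid())

    def forward(self, noisy_feats, task_descriptor):
        enhanced, _ = self.enhancer(noisy_feats)
        w = self.gate(task_descriptor).unsqueeze(1)      # (batch, 1, 1)
        return w * enhanced + (1.0 - w) * noisy_feats    # soft plug-in blend

feats = torch.randn(4, 100, 80)   # (batch, frames, mel bins)
task = torch.randn(4, 8)          # hypothetical downstream-task embedding
out = PluginSESketch()(feats, task)
</code></pre>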
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2401.14421">arXiv:2401.14421</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2401.14421">pdf</a>, <a href="https://arxiv.org/format/2401.14421">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multiagent Systems">cs.MA</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Multi-Agent Based Transfer Learning for Data-Driven Air Traffic Applications </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Deng%2C+C">Chuhao Deng</a>, <a href="/search/eess?searchtype=author&amp;query=Choi%2C+H">Hong-Cheol Choi</a>, <a href="/search/eess?searchtype=author&amp;query=Park%2C+H">Hyunsang Park</a>, <a href="/search/eess?searchtype=author&amp;query=Hwang%2C+I">Inseok Hwang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2401.14421v1-abstract-short" style="display: inline;"> Research in developing data-driven models for Air Traffic Management (ATM) has gained a tremendous interest in recent years. However, data-driven models are known to have long training time and require large datasets to achieve good performance. To address the two issues, this paper proposes a Multi-Agent Bidirectional Encoder Representations from Transformers (MA-BERT) model that fully considers&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.14421v1-abstract-full').style.display = 'inline'; document.getElementById('2401.14421v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2401.14421v1-abstract-full" style="display: none;"> Research in developing data-driven models for Air Traffic Management (ATM) has gained a tremendous interest in recent years. However, data-driven models are known to have long training time and require large datasets to achieve good performance. To address the two issues, this paper proposes a Multi-Agent Bidirectional Encoder Representations from Transformers (MA-BERT) model that fully considers the multi-agent characteristic of the ATM system and learns air traffic controllers&#39; decisions, and a pre-training and fine-tuning transfer learning framework. By pre-training the MA-BERT on a large dataset from a major airport and then fine-tuning it to other airports and specific air traffic applications, a large amount of the total training time can be saved. In addition, for newly adopted procedures and constructed airports where no historical data is available, this paper shows that the pre-trained MA-BERT can achieve high performance by updating regularly with little data. The proposed transfer learning framework and MA-BERT are tested with the automatic dependent surveillance-broadcast data recorded in 3 airports in South Korea in 2019. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.14421v1-abstract-full').style.display = 'none'; document.getElementById('2401.14421v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 January, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">12 pages, 8 figures, submitted for IEEE Transactions on Intelligent Transportation System</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2311.04534">arXiv:2311.04534</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2311.04534">pdf</a>, <a href="https://arxiv.org/format/2311.04534">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Loss Masking Is Not Needed in Decoder-only Transformer for Discrete-token-based ASR </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Chen%2C+Q">Qian Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+W">Wen Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+Q">Qinglin Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Zheng%2C+S">Siqi Zheng</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+S">Shiliang Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Deng%2C+C">Chong Deng</a>, <a href="/search/eess?searchtype=author&amp;query=Ma%2C+Y">Yukun Ma</a>, <a href="/search/eess?searchtype=author&amp;query=Yu%2C+H">Hai Yu</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+J">Jiaqing Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+C">Chong Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2311.04534v2-abstract-short" style="display: inline;"> Recently, unified speech-text models, such as SpeechGPT, VioLA, and AudioPaLM, have achieved remarkable performance on various speech tasks. These models discretize speech signals into tokens (speech discretization) and use a shared vocabulary for both text and speech tokens. Then they train a single decoder-only Transformer on a mixture of speech tasks. However, these models rely on the Loss Mask&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.04534v2-abstract-full').style.display = 'inline'; document.getElementById('2311.04534v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2311.04534v2-abstract-full" style="display: none;"> Recently, unified speech-text models, such as SpeechGPT, VioLA, and AudioPaLM, have achieved remarkable performance on various speech tasks. 
These models discretize speech signals into tokens (speech discretization) and use a shared vocabulary for both text and speech tokens. Then they train a single decoder-only Transformer on a mixture of speech tasks. However, these models rely on the Loss Masking strategy for the ASR task, which ignores the dependency among speech tokens. In this paper, we propose to model speech tokens in an autoregressive way, similar to text. We find that applying the conventional cross-entropy loss on input speech tokens does not consistently improve the ASR performance over the Loss Masking approach. To address this issue, we propose a novel approach denoted Smoothed Label Distillation (SLD), which applies a KL divergence loss with smoothed labels on speech tokens. Our experiments show that SLD effectively models speech tokens and outperforms Loss Masking for decoder-only Transformers in ASR tasks with different speech discretization methods. The source code can be found here: https://github.com/alibaba-damo-academy/SpokenNLP/tree/main/sld <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.04534v2-abstract-full').style.display = 'none'; document.getElementById('2311.04534v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 8 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages, accepted by ICASSP 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2310.17664">arXiv:2310.17664</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2310.17664">pdf</a>, <a href="https://arxiv.org/format/2310.17664">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Cascaded Multi-task Adaptive Learning Based on Neural Architecture Search </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Gao%2C+Y">Yingying Gao</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+S">Shilei Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Cui%2C+Z">Zihao Cui</a>, <a href="/search/eess?searchtype=author&amp;query=Deng%2C+C">Chao Deng</a>, <a href="/search/eess?searchtype=author&amp;query=Feng%2C+J">Junlan Feng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2310.17664v1-abstract-short" style="display: inline;"> Cascading multiple pre-trained models is an effective way to compose an end-to-end system. 
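<p class="is-size-7">Editor's note: the Smoothed Label Distillation objective described for arXiv:2311.04534 above (a KL-divergence loss with smoothed labels on speech tokens) can be sketched in a few lines; the smoothing value and tensor shapes below are illustrative assumptions rather than the paper's exact settings.</p>
<pre><code># Sketch of a smoothed-label KL objective in the spirit of SLD (arXiv:2311.04534).
import torch
import torch.nn.functional as F

def smoothed_label_kl(logits, targets, vocab_size, epsilon=0.1):
    """logits: (batch, seq, vocab); targets: (batch, seq) integer speech-token ids."""
    log_probs = F.log_softmax(logits, dim=-1)
    smooth = torch.full_like(log_probs, epsilon / (vocab_size - 1))
    smooth.scatter_(-1, targets.unsqueeze(-1), 1.0 - epsilon)
    # F.kl_div expects log-probabilities as input and probabilities as target.
    return F.kl_div(log_probs, smooth, reduction="batchmean")

logits = torch.randn(2, 5, 100)            # decoder outputs over speech tokens
tokens = torch.randint(0, 100, (2, 5))     # reference speech tokens
loss = smoothed_label_kl(logits, tokens, vocab_size=100)
</code></pre>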
However, fine-tuning the full cascaded model is parameter- and memory-inefficient, and our observations reveal that only applying adapter modules to the cascaded model cannot achieve performance comparable to fine-tuning. We propose an automatic and effective adaptive learning method to optimize end-to-end casc&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.17664v1-abstract-full').style.display = 'inline'; document.getElementById('2310.17664v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2310.17664v1-abstract-full" style="display: none;"> Cascading multiple pre-trained models is an effective way to compose an end-to-end system. However, fine-tuning the full cascaded model is parameter- and memory-inefficient, and our observations reveal that only applying adapter modules to the cascaded model cannot achieve performance comparable to fine-tuning. We propose an automatic and effective adaptive learning method to optimize end-to-end cascaded multi-task models based on the Neural Architecture Search (NAS) framework. The candidate adaptive operations on each specific module are keeping it frozen, inserting an adapter, and fine-tuning. We further add a penalty term to the loss that takes the number of trainable parameters into account to constrain the learned structure. The penalty term successfully restricts the searched architecture, and the proposed approach finds a tuning scheme similar to the hand-crafted one, compressing the trainable parameters to 8.7% of full fine-tuning on SLURP with even better performance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.17664v1-abstract-full').style.display = 'none'; document.getElementById('2310.17664v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2023.
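<p class="is-size-7">Editor's note: a minimal sketch of the search idea in arXiv:2310.17664 above, assuming a softmax relaxation over the three candidate operations per module (frozen, adapter, fine-tune) and a penalty on the expected number of trainable parameters. The per-operation parameter counts, the penalty weight, and the placeholder task loss are made-up values.</p>
<pre><code># Illustrative sketch: architecture weights choose among {frozen, adapter,
# fine-tune} per module, and the expected trainable-parameter count is penalized.
import torch

num_modules = 4
op_params = torch.tensor([0.0, 0.1e6, 3.0e6])   # hypothetical cost per choice
arch_logits = torch.zeros(num_modules, 3, requires_grad=True)
opt = torch.optim.Adam([arch_logits], lr=0.01)

def expected_param_penalty(logits, weight=1e-7):
    probs = torch.softmax(logits, dim=-1)
    return weight * (probs * op_params).sum()

for step in range(100):
    task_loss = torch.rand(())               # placeholder for the real task loss
    loss = task_loss + expected_param_penalty(arch_logits)
    opt.zero_grad()
    loss.backward()
    opt.step()
</code></pre>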
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2310.13418">arXiv:2310.13418</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2310.13418">pdf</a>, <a href="https://arxiv.org/format/2310.13418">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> GenDistiller: Distilling Pre-trained Language Models based on Generative Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Gao%2C+Y">Yingying Gao</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+S">Shilei Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Cui%2C+Z">Zihao Cui</a>, <a href="/search/eess?searchtype=author&amp;query=Xu%2C+Y">Yanhan Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Deng%2C+C">Chao Deng</a>, <a href="/search/eess?searchtype=author&amp;query=Feng%2C+J">Junlan Feng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2310.13418v1-abstract-short" style="display: inline;"> Self-supervised pre-trained models such as HuBERT and WavLM leverage unlabeled speech data for representation learning and offer significant improvements for numerous downstream tasks. Despite the success of these methods, their large memory and strong computational requirements hinder their application on resource-restricted devices. Therefore, this paper introduces GenDistiller, a novel knowledge d&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.13418v1-abstract-full').style.display = 'inline'; document.getElementById('2310.13418v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2310.13418v1-abstract-full" style="display: none;"> Self-supervised pre-trained models such as HuBERT and WavLM leverage unlabeled speech data for representation learning and offer significant improvements for numerous downstream tasks. Despite the success of these methods, their large memory and strong computational requirements hinder their application on resource-restricted devices. Therefore, this paper introduces GenDistiller, a novel knowledge distillation framework to distill hidden representations from a teacher network based on a generative language model. The generative structure enables the proposed model to generate the target teacher hidden layers autoregressively, considering the interactions between hidden layers without introducing additional inputs. A two-dimensional attention mechanism is implemented to ensure the causality of hidden layers, while preserving bidirectional attention in the time dimension. Experiments reveal the advantage of the generative distiller over the baseline system that predicts the hidden layers of the teacher network directly without a generative model.
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.13418v1-abstract-full').style.display = 'none'; document.getElementById('2310.13418v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2308.02774">arXiv:2308.02774</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2308.02774">pdf</a>, <a href="https://arxiv.org/format/2308.02774">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Self-Distillation Prototypes Network: Learning Robust Speaker Representations without Supervision </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Chen%2C+Y">Yafeng Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Zheng%2C+S">Siqi Zheng</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+H">Hui Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Cheng%2C+L">Luyao Cheng</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+Q">Qian Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Deng%2C+C">Chong Deng</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+S">Shiliang Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+W">Wen Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2308.02774v6-abstract-short" style="display: inline;"> Training speaker-discriminative and robust speaker verification systems without explicit speaker labels remains a persistent challenge. In this paper, we propose a novel self-supervised speaker verification approach, Self-Distillation Prototypes Network (SDPN), which effectively facilitates self-supervised speaker representation learning. SDPN assigns the representation of the augmented views of a&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.02774v6-abstract-full').style.display = 'inline'; document.getElementById('2308.02774v6-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2308.02774v6-abstract-full" style="display: none;"> Training speaker-discriminative and robust speaker verification systems without explicit speaker labels remains a persistent challenge. In this paper, we propose a novel self-supervised speaker verification approach, Self-Distillation Prototypes Network (SDPN), which effectively facilitates self-supervised speaker representation learning. SDPN assigns the representation of the augmented views of an utterance to the same prototypes as the representation of the original view, thereby enabling effective knowledge transfer between the augmented and original views. 
Due to lack of negative pairs in the SDPN training process, the network tends to align positive pairs quite closely in the embedding space, a phenomenon known as model collapse. To mitigate this problem, we introduce a diversity regularization term to embeddings in SDPN. Comprehensive experiments on the VoxCeleb datasets demonstrate the superiority of SDPN among self-supervised speaker verification approaches. SDPN sets a new state-of-the-art on the VoxCeleb1 speaker verification evaluation benchmark, achieving Equal Error Rate 1.80%, 1.99%, and 3.62% for trial VoxCeleb1-O, VoxCeleb1-E and VoxCeleb1-H, without using any speaker labels in training. Ablation studies show that both proposed learnable prototypes in self-distillation network and diversity regularization contribute to the verification performance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.02774v6-abstract-full').style.display = 'none'; document.getElementById('2308.02774v6-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 August, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">arXiv admin note: text overlap with arXiv:2211.04168</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2305.10821">arXiv:2305.10821</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2305.10821">pdf</a>, <a href="https://arxiv.org/format/2305.10821">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Locate and Beamform: Two-dimensional Locating All-neural Beamformer for Multi-channel Speech Separation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Fu%2C+Y">Yanjie Fu</a>, <a href="/search/eess?searchtype=author&amp;query=Ge%2C+M">Meng Ge</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+H">Honglong Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+N">Nan Li</a>, <a href="/search/eess?searchtype=author&amp;query=Yin%2C+H">Haoran Yin</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+L">Longbiao Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+G">Gaoyan Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Dang%2C+J">Jianwu Dang</a>, <a href="/search/eess?searchtype=author&amp;query=Deng%2C+C">Chengyun Deng</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+F">Fei Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2305.10821v3-abstract-short" style="display: inline;"> Recently, stunning improvements on multi-channel speech separation have been achieved by neural beamformers when direction information is available. 
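<p class="is-size-7">Editor's note: the diversity regularization mentioned for SDPN (arXiv:2308.02774) above is, generically, a term that keeps embedding dimensions from collapsing; the hinge-on-standard-deviation form below is one common choice and is an assumption, not necessarily the paper's exact formulation.</p>
<pre><code># Sketch of a generic anti-collapse (diversity) regularizer on speaker embeddings.
import torch

def diversity_regularizer(embeddings, margin=1.0, eps=1e-4):
    """embeddings: (batch, dim); penalize dimensions whose std falls below margin."""
    std = torch.sqrt(embeddings.var(dim=0) + eps)
    return torch.relu(margin - std).mean()

emb = torch.randn(32, 192)        # a batch of speaker embeddings
reg = diversity_regularizer(emb)  # added to the self-distillation loss
</code></pre>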
However, most of them neglect to utilize speaker&#39;s 2-dimensional (2D) location cues contained in mixture signal, which limits the performance when two sources come from close directions. In this paper, we propose an end-to-end beamforming network for&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.10821v3-abstract-full').style.display = 'inline'; document.getElementById('2305.10821v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2305.10821v3-abstract-full" style="display: none;"> Recently, stunning improvements on multi-channel speech separation have been achieved by neural beamformers when direction information is available. However, most of them neglect to utilize speaker&#39;s 2-dimensional (2D) location cues contained in mixture signal, which limits the performance when two sources come from close directions. In this paper, we propose an end-to-end beamforming network for 2D location guided speech separation merely given mixture signal. It first estimates discriminable direction and 2D location cues, which imply directions the sources come from in multi views of microphones and their 2D coordinates. These cues are then integrated into location-aware neural beamformer, thus allowing accurate reconstruction of two sources&#39; speech signals. Experiments show that our proposed model not only achieves a comprehensive decent improvement compared to baseline systems, but avoids inferior performance on spatial overlapping cases. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.10821v3-abstract-full').style.display = 'none'; document.getElementById('2305.10821v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by Interspeech 2023. 
arXiv admin note: substantial text overlap with arXiv:2212.03401</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2303.13932">arXiv:2303.13932</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2303.13932">pdf</a>, <a href="https://arxiv.org/ps/2303.13932">ps</a>, <a href="https://arxiv.org/format/2303.13932">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Overview of the ICASSP 2023 General Meeting Understanding and Generation Challenge (MUG) </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+Q">Qinglin Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Deng%2C+C">Chong Deng</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+J">Jiaqing Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Yu%2C+H">Hai Yu</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+Q">Qian Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+W">Wen Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Yan%2C+Z">Zhijie Yan</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+J">Jinglin Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Ren%2C+Y">Yi Ren</a>, <a href="/search/eess?searchtype=author&amp;query=Zhao%2C+Z">Zhou Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2303.13932v1-abstract-short" style="display: inline;"> ICASSP2023 General Meeting Understanding and Generation Challenge (MUG) focuses on prompting a wide range of spoken language processing (SLP) research on meeting transcripts, as SLP applications are critical to improve users&#39; efficiency in grasping important information in meetings. MUG includes five tracks, including topic segmentation, topic-level and session-level extractive summarization, topi&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.13932v1-abstract-full').style.display = 'inline'; document.getElementById('2303.13932v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2303.13932v1-abstract-full" style="display: none;"> ICASSP2023 General Meeting Understanding and Generation Challenge (MUG) focuses on prompting a wide range of spoken language processing (SLP) research on meeting transcripts, as SLP applications are critical to improve users&#39; efficiency in grasping important information in meetings. MUG includes five tracks, including topic segmentation, topic-level and session-level extractive summarization, topic title generation, keyphrase extraction, and action item detection. To facilitate MUG, we construct and release a large-scale meeting dataset, the AliMeeting4MUG Corpus. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.13932v1-abstract-full').style.display = 'none'; document.getElementById('2303.13932v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 March, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Paper accepted to the 2023 IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP 2023), Rhodes, Greece</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2303.00952">arXiv:2303.00952</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2303.00952">pdf</a>, <a href="https://arxiv.org/format/2303.00952">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Towards Activated Muscle Group Estimation in the Wild </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Peng%2C+K">Kunyu Peng</a>, <a href="/search/eess?searchtype=author&amp;query=Schneider%2C+D">David Schneider</a>, <a href="/search/eess?searchtype=author&amp;query=Roitberg%2C+A">Alina Roitberg</a>, <a href="/search/eess?searchtype=author&amp;query=Yang%2C+K">Kailun Yang</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+J">Jiaming Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Deng%2C+C">Chen Deng</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+K">Kaiyu Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Sarfraz%2C+M+S">M. Saquib Sarfraz</a>, <a href="/search/eess?searchtype=author&amp;query=Stiefelhagen%2C+R">Rainer Stiefelhagen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2303.00952v5-abstract-short" style="display: inline;"> In this paper, we tackle the new task of video-based Activated Muscle Group Estimation (AMGE) aiming at identifying active muscle regions during physical activity in the wild. To this intent, we provide the MuscleMap dataset featuring &gt;15K video clips with 135 different activities and 20 labeled muscle groups. This dataset opens the vistas to multiple video-based applications in sports and rehabil&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.00952v5-abstract-full').style.display = 'inline'; document.getElementById('2303.00952v5-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2303.00952v5-abstract-full" style="display: none;"> In this paper, we tackle the new task of video-based Activated Muscle Group Estimation (AMGE) aiming at identifying active muscle regions during physical activity in the wild. 
To this intent, we provide the MuscleMap dataset featuring &gt;15K video clips with 135 different activities and 20 labeled muscle groups. This dataset opens the vistas to multiple video-based applications in sports and rehabilitation medicine under flexible environment constraints. The proposed MuscleMap dataset is constructed with YouTube videos, specifically targeting High-Intensity Interval Training (HIIT) physical exercise in the wild. To make the AMGE model applicable in real-life situations, it is crucial to ensure that the model can generalize well to numerous types of physical activities not present during training and involving new combinations of activated muscles. To achieve this, our benchmark also covers an evaluation setting where the model is exposed to activity types excluded from the training set. Our experiments reveal that the generalizability of existing architectures adapted for the AMGE task remains a challenge. Therefore, we also propose a new approach, TransM3E, which employs a multi-modality feature fusion mechanism between both the video transformer model and the skeleton-based graph convolution model with novel cross-modal knowledge distillation executed on multi-classification tokens. The proposed method surpasses all popular video classification models when dealing with both, previously seen and new types of physical activities. The database and code can be found at https://github.com/KPeng9510/MuscleMap. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.00952v5-abstract-full').style.display = 'none'; document.getElementById('2303.00952v5-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 1 March, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to ACM MM 2024. 
The database and code can be found at https://github.com/KPeng9510/MuscleMap</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2212.03401">arXiv:2212.03401</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2212.03401">pdf</a>, <a href="https://arxiv.org/format/2212.03401">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> MIMO-DBnet: Multi-channel Input and Multiple Outputs DOA-aware Beamforming Network for Speech Separation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Fu%2C+Y">Yanjie Fu</a>, <a href="/search/eess?searchtype=author&amp;query=Yin%2C+H">Haoran Yin</a>, <a href="/search/eess?searchtype=author&amp;query=Ge%2C+M">Meng Ge</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+L">Longbiao Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+G">Gaoyan Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Dang%2C+J">Jianwu Dang</a>, <a href="/search/eess?searchtype=author&amp;query=Deng%2C+C">Chengyun Deng</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+F">Fei Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2212.03401v1-abstract-short" style="display: inline;"> Recently, many deep learning based beamformers have been proposed for multi-channel speech separation. Nevertheless, most of them rely on extra cues known in advance, such as speaker feature, face image or directional information. In this paper, we propose an end-to-end beamforming network for direction guided speech separation given merely the mixture signal, namely MIMO-DBnet. Specifically, we d&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2212.03401v1-abstract-full').style.display = 'inline'; document.getElementById('2212.03401v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2212.03401v1-abstract-full" style="display: none;"> Recently, many deep learning based beamformers have been proposed for multi-channel speech separation. Nevertheless, most of them rely on extra cues known in advance, such as speaker feature, face image or directional information. In this paper, we propose an end-to-end beamforming network for direction guided speech separation given merely the mixture signal, namely MIMO-DBnet. Specifically, we design a multi-channel input and multiple outputs architecture to predict the direction-of-arrival based embeddings and beamforming weights for each source. The precisely estimated directional embedding provides quite effective spatial discrimination guidance for the neural beamformer to offset the effect of phase wrapping, thus allowing more accurate reconstruction of two sources&#39; speech signals. 
Experiments show that our proposed MIMO-DBnet not only achieves consistent improvements over baseline systems, but also maintains performance on high-frequency bands when phase wrapping occurs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2212.03401v1-abstract-full').style.display = 'none'; document.getElementById('2212.03401v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 December, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Submitted to ICASSP 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2211.00206">arXiv:2211.00206</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2211.00206">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> A Primary Frequency Control Strategy for Variable-Speed Pumped-Storage Plant in Power Generation Based on Adaptive Model Predictive Control </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Xu%2C+Z">Zhenghua Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Deng%2C+C">Changhong Deng</a>, <a href="/search/eess?searchtype=author&amp;query=Yang%2C+Q">Qiuling Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2211.00206v1-abstract-short" style="display: inline;"> Variable-speed pumped-storage (VSPS) has great potential to help solve the frequency control problem caused by low inertia, owing to its remarkable flexibility beyond the conventional fixed-speed type. To make better use of this flexibility, a primary frequency control strategy based on adaptive model predictive control (AMPC) is proposed in this paper for a VSPS plant in power generation. </span> <span class="abstract-full has-text-grey-dark mathjax" id="2211.00206v1-abstract-full" style="display: none;"> Variable-speed pumped-storage (VSPS) has great potential to help solve the frequency control problem caused by low inertia, owing to its remarkable flexibility beyond the conventional fixed-speed type. To make better use of this flexibility, a primary frequency control strategy based on adaptive model predictive control (AMPC) is proposed in this paper for a VSPS plant in power generation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.00206v1-abstract-full').style.display = 'none'; document.getElementById('2211.00206v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 October, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2022.
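<p class="is-size-7">Editor's note: a toy finite-horizon sketch of the model-predictive idea behind the AMPC strategy in arXiv:2211.00206 above. The scalar frequency-deviation model, its parameters, and the weights are illustrative assumptions; the adaptive element (online re-identification of the model) and actuator limits are omitted.</p>
<pre><code># Toy unconstrained MPC for frequency support: predict deviations over a
# horizon, minimize a quadratic cost, apply the first control move.
import numpy as np

a, b = 0.95, 0.08   # assumed discrete-time dynamics: f_next = a * f + b * u
N, r = 12, 0.01     # horizon length and control weight
f0 = 0.3            # measured frequency deviation

# Stack the predicted deviations into f = F * f0 + G * u (G lower-triangular).
F = np.array([a ** k for k in range(1, N + 1)])
G = np.zeros((N, N))
for k in range(N):
    for j in range(k + 1):
        G[k, j] = a ** (k - j) * b
u = -np.linalg.solve(G.T @ G + r * np.eye(N), G.T @ F * f0)
print("first control move:", u[0])
</code></pre>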
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages, 9 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2210.01434">arXiv:2210.01434</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2210.01434">pdf</a>, <a href="https://arxiv.org/ps/2210.01434">ps</a>, <a href="https://arxiv.org/format/2210.01434">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Beamforming Design and Trajectory Optimization for UAV-Empowered Adaptable Integrated Sensing and Communication </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Deng%2C+C">Cailian Deng</a>, <a href="/search/eess?searchtype=author&amp;query=Fang%2C+X">Xuming Fang</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+X">Xianbin Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2210.01434v1-abstract-short" style="display: inline;"> Unmanned aerial vehicle (UAV) has high flexibility and controllable mobility, therefore it is considered as a promising enabler for future integrated sensing and communication (ISAC). In this paper, we propose a novel adaptable ISAC (AISAC) mechanism in the UAV-enabled system, where the UAV performs sensing on demand during communication and the sensing duration is configured flexibly according to&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2210.01434v1-abstract-full').style.display = 'inline'; document.getElementById('2210.01434v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2210.01434v1-abstract-full" style="display: none;"> Unmanned aerial vehicle (UAV) has high flexibility and controllable mobility, therefore it is considered as a promising enabler for future integrated sensing and communication (ISAC). In this paper, we propose a novel adaptable ISAC (AISAC) mechanism in the UAV-enabled system, where the UAV performs sensing on demand during communication and the sensing duration is configured flexibly according to the application requirements rather than keeping the same with the communication duration. Our designed mechanism avoids the excessive sensing and waste of radio resources, therefore improving the resource utilization and system performance. In the UAV-enabled AISAC system, we aim at maximizing the average system throughput by optimizing the communication and sensing beamforming as well as UAV trajectory while guaranteeing the quality-of-service requirements of communication and sensing. To efficiently solve the considered non-convex optimization problem, we first propose an efficient alternating optimization algorithm to optimize the communication and sensing beamforming for a given UAV location, and then develop a low-complexity joint beamforming and UAV trajectory optimization algorithm that sequentially searches the optimal UAV location until reaching the final location. Numerical results validate the superiority of the proposed adaptable mechanism and the effectiveness of the designed algorithm. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2210.01434v1-abstract-full').style.display = 'none'; document.getElementById('2210.01434v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 October, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This work has been submitted to the IEEE for possible publication</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2209.13915">arXiv:2209.13915</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2209.13915">pdf</a>, <a href="https://arxiv.org/ps/2209.13915">ps</a>, <a href="https://arxiv.org/format/2209.13915">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Joint Optimization of Resource Allocation and Trajectory Control for Mobile Group Users in Fixed-Wing UAV-Enabled Wireless Network </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Yan%2C+X">Xuezhen Yan</a>, <a href="/search/eess?searchtype=author&amp;query=Fang%2C+X">Xuming Fang</a>, <a href="/search/eess?searchtype=author&amp;query=Deng%2C+C">Cailian Deng</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+X">Xianbin Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2209.13915v1-abstract-short" style="display: inline;"> Owing to the controlling flexibility and cost-effectiveness, fixed-wing unmanned aerial vehicles (UAVs) are expected to serve as flying base stations (BSs) in the air-ground integrated network. By exploiting the mobility of UAVs, controllable coverage can be provided for mobile group users (MGUs) under challenging scenarios or even somewhere without communication infrastructure. However, in such d&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2209.13915v1-abstract-full').style.display = 'inline'; document.getElementById('2209.13915v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2209.13915v1-abstract-full" style="display: none;"> Owing to the controlling flexibility and cost-effectiveness, fixed-wing unmanned aerial vehicles (UAVs) are expected to serve as flying base stations (BSs) in the air-ground integrated network. By exploiting the mobility of UAVs, controllable coverage can be provided for mobile group users (MGUs) under challenging scenarios or even somewhere without communication infrastructure. However, in such dual mobility scenario where the UAV and MGUs are all moving, both the non-hovering feature of the fixed-wing UAV and the movement of MGUs will exacerbate the dynamic changes of user scheduling, which eventually leads to the degradation of MGUs&#39; quality-of-service (QoS). In this paper, we propose a fixed-wing UAV-enabled wireless network architecture to provide moving coverage for MGUs. 
In order to achieve fairness among MGUs, we maximize the minimum average throughput between all users by jointly optimizing the user scheduling, resource allocation, and UAV trajectory control under the constraints on users&#39; QoS requirements, communication resources, and UAV trajectory switching. Considering the optimization problem is mixed-integer non-convex, we decompose it into three optimization subproblems. An efficient algorithm is proposed to solve these three subproblems alternately till the convergence is realized. Simulation results demonstrate that the proposed algorithm can significantly improve the minimum average throughput of MGUs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2209.13915v1-abstract-full').style.display = 'none'; document.getElementById('2209.13915v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 September, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">30 pages, 9 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2208.13952">arXiv:2208.13952</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2208.13952">pdf</a>, <a href="https://arxiv.org/format/2208.13952">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Optics">physics.optics</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/TGRS.2022.3223649">10.1109/TGRS.2022.3223649 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Micro-Vibration Modes Reconstruction Based on Micro-Doppler Coincidence Imaging </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Liu%2C+S">Shuang Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Deng%2C+C">Chenjin Deng</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+C">Chaoran Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Bo%2C+Z">Zunwang Bo</a>, <a href="/search/eess?searchtype=author&amp;query=Han%2C+S">Shensheng Han</a>, <a href="/search/eess?searchtype=author&amp;query=Lin%2C+Z">Zihuai Lin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2208.13952v1-abstract-short" style="display: inline;"> Micro-vibration, a ubiquitous nature phenomenon, can be seen as a characteristic feature on the objects, these vibrations always have tiny amplitudes which are much less than the wavelengths of the sensing systems, thus these motions information can only be reflected in the phase item of echo. 
Normally the conventional radar system can detect these micro vibrations through the time frequency analy&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2208.13952v1-abstract-full').style.display = 'inline'; document.getElementById('2208.13952v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2208.13952v1-abstract-full" style="display: none;"> Micro-vibration, a ubiquitous natural phenomenon, can be seen as a characteristic feature of objects. These vibrations usually have tiny amplitudes, much smaller than the wavelengths of the sensing systems, so the motion information is only reflected in the phase term of the echo. A conventional radar system can detect these micro-vibrations through time-frequency analysis, but the vibration characteristics are only revealed in the time-frequency spectrum, and the spatial distribution of the micro-vibrations cannot be reconstructed precisely. Ghost imaging (GI), a novel imaging method also known as Coincidence Imaging that originated in the quantum and optical fields, can reconstruct unknown images using computational methods. To reconstruct the spatial distribution of micro-vibrations, this paper proposes a new method based on a coincidence imaging system. A detailed model of target micro-vibration is created first, taking into account two categories: discrete and continuous targets. In this work, we use the first-order field correlation feature, together with the complex target models and time-frequency analysis, to obtain the distinct micro-vibration distributions of the object. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2208.13952v1-abstract-full').style.display = 'none'; document.getElementById('2208.13952v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 August, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2022.
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2206.12774">arXiv:2206.12774</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2206.12774">pdf</a>, <a href="https://arxiv.org/format/2206.12774">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Meta Auxiliary Learning for Low-resource Spoken Language Understanding </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Gao%2C+Y">Yingying Gao</a>, <a href="/search/eess?searchtype=author&amp;query=Feng%2C+J">Junlan Feng</a>, <a href="/search/eess?searchtype=author&amp;query=Deng%2C+C">Chao Deng</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+S">Shilei Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2206.12774v1-abstract-short" style="display: inline;"> Spoken language understanding (SLU) treats automatic speech recognition (ASR) and natural language understanding (NLU) as a unified task and usually suffers from data scarcity. We exploit an ASR and NLU joint training method based on meta auxiliary learning to improve the performance of low-resource SLU task by only taking advantage of abundant manual transcriptions of speech data. One obvious adv&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2206.12774v1-abstract-full').style.display = 'inline'; document.getElementById('2206.12774v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2206.12774v1-abstract-full" style="display: none;"> Spoken language understanding (SLU) treats automatic speech recognition (ASR) and natural language understanding (NLU) as a unified task and usually suffers from data scarcity. We exploit an ASR and NLU joint training method based on meta auxiliary learning to improve the performance of low-resource SLU task by only taking advantage of abundant manual transcriptions of speech data. One obvious advantage of such method is that it provides a flexible framework to implement a low-resource SLU training task without requiring access to any further semantic annotations. In particular, a NLU model is taken as label generation network to predict intent and slot tags from texts; a multi-task network trains ASR task and SLU task synchronously from speech; and the predictions of label generation network are delivered to the multi-task network as semantic targets. The efficiency of the proposed algorithm is demonstrated with experiments on the public CATSLU dataset, which produces more suitable ASR hypotheses for the downstream NLU task. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2206.12774v1-abstract-full').style.display = 'none'; document.getElementById('2206.12774v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 June, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2022. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2206.08031">arXiv:2206.08031</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2206.08031">pdf</a>, <a href="https://arxiv.org/format/2206.08031">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> A CTC Triggered Siamese Network with Spatial-Temporal Dropout for Speech Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Gao%2C+Y">Yingying Gao</a>, <a href="/search/eess?searchtype=author&amp;query=Feng%2C+J">Junlan Feng</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+T">Tianrui Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Deng%2C+C">Chao Deng</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+S">Shilei Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2206.08031v2-abstract-short" style="display: inline;"> Siamese networks have shown effective results in unsupervised visual representation learning. These models are designed to learn an invariant representation of two augmentations for one input by maximizing their similarity. In this paper, we propose an effective Siamese network to improve the robustness of End-to-End automatic speech recognition (ASR). We introduce spatial-temporal dropout to supp&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2206.08031v2-abstract-full').style.display = 'inline'; document.getElementById('2206.08031v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2206.08031v2-abstract-full" style="display: none;"> Siamese networks have shown effective results in unsupervised visual representation learning. These models are designed to learn an invariant representation of two augmentations for one input by maximizing their similarity. In this paper, we propose an effective Siamese network to improve the robustness of End-to-End automatic speech recognition (ASR). We introduce spatial-temporal dropout to support a more violent disturbance for Siamese-ASR framework. Besides, we also relax the similarity regularization to maximize the similarities of distributions on the frames that connectionist temporal classification (CTC) spikes occur rather than on all of them. The efficiency of the proposed architecture is evaluated on two benchmarks, AISHELL-1 and Librispeech, resulting in 7.13% and 6.59% relative character error rate (CER) and word error rate (WER) reductions respectively. Analysis shows that our proposed approach brings a better uniformity for the trained model and enlarges the CTC spikes obviously. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2206.08031v2-abstract-full').style.display = 'none'; document.getElementById('2206.08031v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 June, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 16 June, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2022. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2202.04250">arXiv:2202.04250</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2202.04250">pdf</a>, <a href="https://arxiv.org/format/2202.04250">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> GenAD: General Representations of Multivariate Time Seriesfor Anomaly Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Hua%2C+X">Xiaolei Hua</a>, <a href="/search/eess?searchtype=author&amp;query=Zhu%2C+L">Lin Zhu</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+S">Shenglin Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+Z">Zeyan Li</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+S">Su Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Zhou%2C+D">Dong Zhou</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+S">Shuo Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Deng%2C+C">Chao Deng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2202.04250v1-abstract-short" style="display: inline;"> The reliability of wireless base stations in China Mobile is of vital importance, because the cell phone users are connected to the stations and the behaviors of the stations are directly related to user experience. Although the monitoring of the station behaviors can be realized by anomaly detection on multivariate time series, due to complex correlations and various temporal patterns of multivar&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2202.04250v1-abstract-full').style.display = 'inline'; document.getElementById('2202.04250v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2202.04250v1-abstract-full" style="display: none;"> The reliability of wireless base stations in China Mobile is of vital importance, because the cell phone users are connected to the stations and the behaviors of the stations are directly related to user experience. Although the monitoring of the station behaviors can be realized by anomaly detection on multivariate time series, due to complex correlations and various temporal patterns of multivariate series in large-scale stations, building a general unsupervised anomaly detection model with a higher F1-score remains a challenging task. 
In this paper, we propose a General representation of multivariate time series for Anomaly Detection(GenAD). First, we pre-train a general model on large-scale wireless base stations with self-supervision, which can be easily transferred to a specific station anomaly detection with a small amount of training data. Second, we employ Multi-Correlation Attention and Time-Series Attention to represent the correlations and temporal patterns of the stations. With the above innovations, GenAD increases F1-score by total 9% on real-world datasets in China Mobile, while the performance does not significantly degrade on public datasets with only 10% of the training data. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2202.04250v1-abstract-full').style.display = 'none'; document.getElementById('2202.04250v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 February, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2022. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2111.14220">arXiv:2111.14220</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2111.14220">pdf</a>, <a href="https://arxiv.org/format/2111.14220">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> On the Robustness and Generalization of Deep Learning Driven Full Waveform Inversion </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Deng%2C+C">Chengyuan Deng</a>, <a href="/search/eess?searchtype=author&amp;query=Lin%2C+Y">Youzuo Lin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2111.14220v1-abstract-short" style="display: inline;"> The data-driven approach has been demonstrated as a promising technique to solve complicated scientific problems. Full Waveform Inversion (FWI) is commonly epitomized as an image-to-image translation task, which motivates the use of deep neural networks as an end-to-end solution. Despite being trained with synthetic data, the deep learning-driven FWI is expected to perform well when evaluated with&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2111.14220v1-abstract-full').style.display = 'inline'; document.getElementById('2111.14220v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2111.14220v1-abstract-full" style="display: none;"> The data-driven approach has been demonstrated as a promising technique to solve complicated scientific problems. Full Waveform Inversion (FWI) is commonly epitomized as an image-to-image translation task, which motivates the use of deep neural networks as an end-to-end solution. Despite being trained with synthetic data, the deep learning-driven FWI is expected to perform well when evaluated with sufficient real-world data. 
In this paper, we study such properties by asking: how robust are these deep neural networks and how do they generalize? For robustness, we prove the upper bounds of the deviation between the predictions from clean and noisy data. Moreover, we demonstrate an interplay between the noise level and the additional gain of loss. For generalization, we prove a norm-based generalization error upper bound via a stability-generalization framework. Experimental results on seismic FWI datasets corroborate with the theoretical results, shedding light on a better understanding of utilizing Deep Learning for complicated scientific applications. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2111.14220v1-abstract-full').style.display = 'none'; document.getElementById('2111.14220v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 November, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2021. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2111.02926">arXiv:2111.02926</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2111.02926">pdf</a>, <a href="https://arxiv.org/format/2111.02926">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> OpenFWI: Large-Scale Multi-Structural Benchmark Datasets for Seismic Full Waveform Inversion </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Deng%2C+C">Chengyuan Deng</a>, <a href="/search/eess?searchtype=author&amp;query=Feng%2C+S">Shihang Feng</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+H">Hanchen Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+X">Xitong Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Jin%2C+P">Peng Jin</a>, <a href="/search/eess?searchtype=author&amp;query=Feng%2C+Y">Yinan Feng</a>, <a href="/search/eess?searchtype=author&amp;query=Zeng%2C+Q">Qili Zeng</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+Y">Yinpeng Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Lin%2C+Y">Youzuo Lin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2111.02926v6-abstract-short" style="display: inline;"> Full waveform inversion (FWI) is widely used in geophysics to reconstruct high-resolution velocity maps from seismic data. The recent success of data-driven FWI methods results in a rapidly increasing demand for open datasets to serve the geophysics community. 
We present OpenFWI, a collection of large-scale multi-structural benchmark datasets, to facilitate diversified, rigorous, and reproducible&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2111.02926v6-abstract-full').style.display = 'inline'; document.getElementById('2111.02926v6-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2111.02926v6-abstract-full" style="display: none;"> Full waveform inversion (FWI) is widely used in geophysics to reconstruct high-resolution velocity maps from seismic data. The recent success of data-driven FWI methods results in a rapidly increasing demand for open datasets to serve the geophysics community. We present OpenFWI, a collection of large-scale multi-structural benchmark datasets, to facilitate diversified, rigorous, and reproducible research on FWI. In particular, OpenFWI consists of 12 datasets (2.1TB in total) synthesized from multiple sources. It encompasses diverse domains in geophysics (interface, fault, CO2 reservoir, etc.), covers different geological subsurface structures (flat, curve, etc.), and contains various amounts of data samples (2K - 67K). It also includes a dataset for 3D FWI. Moreover, we use OpenFWI to perform benchmarking over four deep learning methods, covering both supervised and unsupervised learning regimes. Along with the benchmarks, we implement additional experiments, including physics-driven methods, complexity analysis, generalization study, uncertainty quantification, and so on, to sharpen our understanding of datasets and methods. The studies either provide valuable insights into the datasets and the performance, or uncover their current limitations. We hope OpenFWI supports prospective research on FWI and inspires future open-source efforts on AI for science. All datasets and related information can be accessed through our website at https://openfwi-lanl.github.io/ <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2111.02926v6-abstract-full').style.display = 'none'; document.getElementById('2111.02926v6-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 November, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2021. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This manuscript has been accepted by NeurIPS 2022 dataset and benchmark track</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2106.15765">arXiv:2106.15765</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2106.15765">pdf</a>, <a href="https://arxiv.org/format/2106.15765">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Optics">physics.optics</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1364/PRJ.435256">10.1364/PRJ.435256 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> 10-mega pixel snapshot compressive imaging with a hybrid coded aperture </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+Z">Zhihong Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Deng%2C+C">Chao Deng</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+Y">Yang Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Yuan%2C+X">Xin Yuan</a>, <a href="/search/eess?searchtype=author&amp;query=Suo%2C+J">Jinli Suo</a>, <a href="/search/eess?searchtype=author&amp;query=Dai%2C+Q">Qionghai Dai</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2106.15765v2-abstract-short" style="display: inline;"> High resolution images are widely used in our daily life, whereas high-speed video capture is challenging due to the low frame rate of cameras working at the high resolution mode. Digging deeper, the main bottleneck lies in the low throughput of existing imaging systems. Towards this end, snapshot compressive imaging (SCI) was proposed as a promising solution to improve the throughput of imaging s&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2106.15765v2-abstract-full').style.display = 'inline'; document.getElementById('2106.15765v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2106.15765v2-abstract-full" style="display: none;"> High resolution images are widely used in our daily life, whereas high-speed video capture is challenging due to the low frame rate of cameras working at the high resolution mode. Digging deeper, the main bottleneck lies in the low throughput of existing imaging systems. Towards this end, snapshot compressive imaging (SCI) was proposed as a promising solution to improve the throughput of imaging systems by compressive sampling and computational reconstruction. During acquisition, multiple high-speed images are encoded and collapsed to a single measurement. After this, algorithms are employed to retrieve the video frames from the coded snapshot. 
Recently developed Plug-and-Play (PnP) algorithms make it possible for SCI reconstruction in large-scale problems. However, the lack of high-resolution encoding systems still precludes SCI&#39;s wide application. In this paper, we build a novel hybrid coded aperture snapshot compressive imaging (HCA-SCI) system by incorporating a dynamic liquid crystal on silicon and a high-resolution lithography mask. We further implement a PnP reconstruction algorithm with cascaded denoisers for high quality reconstruction. Based on the proposed HCA-SCI system and algorithm, we achieve a 10-mega pixel SCI system to capture high-speed scenes, leading to a high throughput of 4.6G voxels per second. Both simulation and real data experiments verify the feasibility and performance of our proposed HCA-SCI scheme. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2106.15765v2-abstract-full').style.display = 'none'; document.getElementById('2106.15765v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 August, 2021; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 29 June, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">11 pages, 8 figures, accepted by Photonics Research</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2011.02109">arXiv:2011.02109</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2011.02109">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Deep Multi-task Network for Delay Estimation and Echo Cancellation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+Y">Yi Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Deng%2C+C">Chengyun Deng</a>, <a href="/search/eess?searchtype=author&amp;query=Ma%2C+S">Shiqian Ma</a>, <a href="/search/eess?searchtype=author&amp;query=Sha%2C+Y">Yongtao Sha</a>, <a href="/search/eess?searchtype=author&amp;query=Song%2C+H">Hui Song</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2011.02109v2-abstract-short" style="display: inline;"> Echo path delay (or ref-delay) estimation is a big challenge in acoustic echo cancellation. Different devices may introduce various ref-delay in practice. Ref-delay inconsistency slows down the convergence of adaptive filters, and also degrades the performance of deep learning models due to &#39;unseen&#39; ref-delays in the training set. 
In this paper, a multi-task network is proposed to address both ref&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2011.02109v2-abstract-full').style.display = 'inline'; document.getElementById('2011.02109v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2011.02109v2-abstract-full" style="display: none;"> Echo path delay (or ref-delay) estimation is a big challenge in acoustic echo cancellation. Different devices may introduce various ref-delay in practice. Ref-delay inconsistency slows down the convergence of adaptive filters, and also degrades the performance of deep learning models due to &#39;unseen&#39; ref-delays in the training set. In this paper, a multi-task network is proposed to address both ref-delay estimation and echo cancellation tasks. The proposed architecture consists of two convolutional recurrent networks (CRNNs) to estimate the echo and enhanced signals separately, as well as a fully-connected (FC) network to estimate the echo path delay. Echo signal is first predicted, and then is combined with reference signal together for delay estimation. At the end, delay compensated reference and microphone signals are used to predict the enhanced target signal. Experimental results suggest that the proposed method makes reliable delay estimation and outperforms the existing state-of-the-art solutions in inconsistent echo path delay scenarios, in terms of echo return loss enhancement (ERLE) and perceptual evaluation of speech quality (PESQ). Furthermore, a data augmentation method is studied to evaluate the model performance on different portion of synthetical data with artificially introduced ref-delay. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2011.02109v2-abstract-full').style.display = 'none'; document.getElementById('2011.02109v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 August, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 3 November, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2020. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by Interspeech 2020</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2011.02102">arXiv:2011.02102</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2011.02102">pdf</a>, <a href="https://arxiv.org/format/2011.02102">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Robust Speaker Extraction Network Based on Iterative Refined Adaptation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Deng%2C+C">Chengyun Deng</a>, <a href="/search/eess?searchtype=author&amp;query=Ma%2C+S">Shiqian Ma</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+Y">Yi Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Sha%2C+Y">Yongtao Sha</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+H">Hui Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Song%2C+H">Hui Song</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+X">Xiangang Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2011.02102v2-abstract-short" style="display: inline;"> Speaker extraction aims to extract target speech signal from a multi-talker environment with interference speakers and surrounding noise, given the target speaker&#39;s reference information. Most speaker extraction systems achieve satisfactory performance on the premise that the test speakers have been encountered during training time. Such systems suffer from performance degradation given unseen tar&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2011.02102v2-abstract-full').style.display = 'inline'; document.getElementById('2011.02102v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2011.02102v2-abstract-full" style="display: none;"> Speaker extraction aims to extract target speech signal from a multi-talker environment with interference speakers and surrounding noise, given the target speaker&#39;s reference information. Most speaker extraction systems achieve satisfactory performance on the premise that the test speakers have been encountered during training time. Such systems suffer from performance degradation given unseen target speakers and/or mismatched reference voiceprint information. In this paper we propose a novel strategy named Iterative Refined Adaptation (IRA) to improve the robustness and generalization capability of speaker extraction systems in the aforementioned scenarios. Given an initial speaker embedding encoded by an auxiliary network, the extraction network can obtain a latent representation of the target speaker, which is fed back to the auxiliary network to get a refined embedding to provide more accurate guidance for the extraction network. Experiments on WSJ0-2mix-extr and WHAM! dataset confirm the superior performance of the proposed method over the network without IRA in terms of SI-SDR and PESQ improvement. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2011.02102v2-abstract-full').style.display = 'none'; document.getElementById('2011.02102v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 August, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 3 November, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2020. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by Interspeech 2021</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2007.14974">arXiv:2007.14974</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2007.14974">pdf</a>, <a href="https://arxiv.org/format/2007.14974">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> On Loss Functions and Recurrency Training for GAN-based Speech Enhancement Systems </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+Z">Zhuohuang Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Deng%2C+C">Chengyun Deng</a>, <a href="/search/eess?searchtype=author&amp;query=Shen%2C+Y">Yi Shen</a>, <a href="/search/eess?searchtype=author&amp;query=Williamson%2C+D+S">Donald S. Williamson</a>, <a href="/search/eess?searchtype=author&amp;query=Sha%2C+Y">Yongtao Sha</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+Y">Yi Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Song%2C+H">Hui Song</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+X">Xiangang Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2007.14974v3-abstract-short" style="display: inline;"> Recent work has shown that it is feasible to use generative adversarial networks (GANs) for speech enhancement, however, these approaches have not been compared to state-of-the-art (SOTA) non GAN-based approaches. Additionally, many loss functions have been proposed for GAN-based approaches, but they have not been adequately compared. In this study, we propose novel convolutional recurrent GAN (CR&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2007.14974v3-abstract-full').style.display = 'inline'; document.getElementById('2007.14974v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2007.14974v3-abstract-full" style="display: none;"> Recent work has shown that it is feasible to use generative adversarial networks (GANs) for speech enhancement, however, these approaches have not been compared to state-of-the-art (SOTA) non GAN-based approaches. Additionally, many loss functions have been proposed for GAN-based approaches, but they have not been adequately compared. 
In this study, we propose novel convolutional recurrent GAN (CRGAN) architectures for speech enhancement. Multiple loss functions are adopted to enable direct comparisons to other GAN-based systems. The benefits of including recurrent layers are also explored. Our results show that the proposed CRGAN model outperforms the SOTA GAN-based models using the same loss functions and it outperforms other non-GAN based systems, indicating the benefits of using a GAN for speech enhancement. Overall, the CRGAN model that combines an objective metric loss function with the mean squared error (MSE) provides the best performance over comparison approaches across many evaluation metrics. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2007.14974v3-abstract-full').style.display = 'none'; document.getElementById('2007.14974v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 December, 2020; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 29 July, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2020. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">accepted by Interspeech2020, 5 pages, 2 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2007.13401">arXiv:2007.13401</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2007.13401">pdf</a>, <a href="https://arxiv.org/ps/2007.13401">ps</a>, <a href="https://arxiv.org/format/2007.13401">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> IEEE 802.11be-Wi-Fi 7: New Challenges and Opportunities </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Deng%2C+C">Cailian Deng</a>, <a href="/search/eess?searchtype=author&amp;query=Fang%2C+X">Xuming Fang</a>, <a href="/search/eess?searchtype=author&amp;query=Han%2C+X">Xiao Han</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+X">Xianbin Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Yan%2C+L">Li Yan</a>, <a href="/search/eess?searchtype=author&amp;query=He%2C+R">Rong He</a>, <a href="/search/eess?searchtype=author&amp;query=Long%2C+Y">Yan Long</a>, <a href="/search/eess?searchtype=author&amp;query=Guo%2C+Y">Yuchen Guo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2007.13401v3-abstract-short" style="display: inline;"> With the emergence of 4k/8k video, the throughput requirement of video delivery will keep grow to tens of Gbps. Other new high-throughput and low-latency video applications including augmented reality (AR), virtual reality (VR), and online gaming, are also proliferating. 
Due to the related stringent requirements, supporting these applications over wireless local area network (WLAN) is far beyond t&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2007.13401v3-abstract-full').style.display = 'inline'; document.getElementById('2007.13401v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2007.13401v3-abstract-full" style="display: none;"> With the emergence of 4k/8k video, the throughput requirement of video delivery will keep growing to tens of Gbps. Other new high-throughput and low-latency video applications, including augmented reality (AR), virtual reality (VR), and online gaming, are also proliferating. Due to the related stringent requirements, supporting these applications over wireless local area network (WLAN) is far beyond the capabilities of the new WLAN standard -- IEEE 802.11ax. To meet these emerging demands, IEEE 802.11 will release a new amendment standard, IEEE 802.11be -- Extremely High Throughput (EHT), also known as Wireless-Fidelity (Wi-Fi) 7. This article provides a comprehensive survey of the key medium access control (MAC) layer techniques and physical layer (PHY) techniques being discussed in the EHT task group, including the channelization and tone plan, multiple resource units (multi-RU) support, 4096 quadrature amplitude modulation (4096-QAM), preamble designs, multiple link operations (e.g., multi-link aggregation and channel access), multiple input multiple output (MIMO) enhancement, multiple access point (multi-AP) coordination (e.g., multi-AP joint transmission), and enhanced link adaptation and retransmission protocols (e.g., hybrid automatic repeat request (HARQ)). This survey covers both the critical technologies being discussed in the EHT standard and the latest related progress from worldwide research. In addition, potential developments beyond EHT are discussed to provide possible future research directions for WLAN. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2007.13401v3-abstract-full').style.display = 'none'; document.getElementById('2007.13401v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 August, 2020; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 27 July, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2020.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted for publication in IEEE Communications Surveys and Tutorials</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1912.01852">arXiv:1912.01852</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1912.01852">pdf</a>, <a href="https://arxiv.org/format/1912.01852">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> PitchNet: Unsupervised Singing Voice Conversion with Pitch Adversarial Network </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Deng%2C+C">Chengqi Deng</a>, <a href="/search/eess?searchtype=author&amp;query=Yu%2C+C">Chengzhu Yu</a>, <a href="/search/eess?searchtype=author&amp;query=Lu%2C+H">Heng Lu</a>, <a href="/search/eess?searchtype=author&amp;query=Weng%2C+C">Chao Weng</a>, <a href="/search/eess?searchtype=author&amp;query=Yu%2C+D">Dong Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1912.01852v2-abstract-short" style="display: inline;"> Singing voice conversion is to convert a singer&#39;s voice to another one&#39;s voice without changing singing content. Recent work shows that unsupervised singing voice conversion can be achieved with an autoencoder-based approach [1]. However, the converted singing voice can be easily out of key, showing that the existing approach cannot model the pitch information precisely. In this paper, we propose&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1912.01852v2-abstract-full').style.display = 'inline'; document.getElementById('1912.01852v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1912.01852v2-abstract-full" style="display: none;"> Singing voice conversion is to convert a singer&#39;s voice to another one&#39;s voice without changing singing content. Recent work shows that unsupervised singing voice conversion can be achieved with an autoencoder-based approach [1]. However, the converted singing voice can be easily out of key, showing that the existing approach cannot model the pitch information precisely. In this paper, we propose to advance the existing unsupervised singing voice conversion method proposed in [1] to achieve more accurate pitch translation and flexible pitch manipulation. Specifically, the proposed PitchNet added an adversarially trained pitch regression network to enforce the encoder network to learn pitch invariant phoneme representation, and a separate module to feed pitch extracted from the source audio to the decoder network. Our evaluation shows that the proposed method can greatly improve the quality of the converted singing voice (2.92 vs 3.75 in MOS). 
We also demonstrate that the pitch of converted singing can be easily controlled during generation by changing the levels of the extracted pitch before passing it to the decoder network. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1912.01852v2-abstract-full').style.display = 'none'; document.getElementById('1912.01852v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 February, 2020; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 December, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2019. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ICASSP 2020</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1901.07042">arXiv:1901.07042</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1901.07042">pdf</a>, <a href="https://arxiv.org/format/1901.07042">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> MIMIC-CXR-JPG, a large publicly available database of labeled chest radiographs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Johnson%2C+A+E+W">Alistair E. W. Johnson</a>, <a href="/search/eess?searchtype=author&amp;query=Pollard%2C+T+J">Tom J. Pollard</a>, <a href="/search/eess?searchtype=author&amp;query=Greenbaum%2C+N+R">Nathaniel R. Greenbaum</a>, <a href="/search/eess?searchtype=author&amp;query=Lungren%2C+M+P">Matthew P. Lungren</a>, <a href="/search/eess?searchtype=author&amp;query=Deng%2C+C">Chih-ying Deng</a>, <a href="/search/eess?searchtype=author&amp;query=Peng%2C+Y">Yifan Peng</a>, <a href="/search/eess?searchtype=author&amp;query=Lu%2C+Z">Zhiyong Lu</a>, <a href="/search/eess?searchtype=author&amp;query=Mark%2C+R+G">Roger G. Mark</a>, <a href="/search/eess?searchtype=author&amp;query=Berkowitz%2C+S+J">Seth J. Berkowitz</a>, <a href="/search/eess?searchtype=author&amp;query=Horng%2C+S">Steven Horng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1901.07042v5-abstract-short" style="display: inline;"> Chest radiography is an extremely powerful imaging modality, allowing for a detailed inspection of a patient&#39;s thorax, but requiring specialized training for proper interpretation. With the advent of high performance general purpose computer vision algorithms, the accurate automated analysis of chest radiographs is becoming increasingly of interest to researchers. 
However, a key challenge in the d&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1901.07042v5-abstract-full').style.display = 'inline'; document.getElementById('1901.07042v5-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1901.07042v5-abstract-full" style="display: none;"> Chest radiography is an extremely powerful imaging modality, allowing for a detailed inspection of a patient&#39;s thorax, but requiring specialized training for proper interpretation. With the advent of high performance general purpose computer vision algorithms, the accurate automated analysis of chest radiographs is becoming increasingly of interest to researchers. However, a key challenge in the development of these techniques is the lack of sufficient data. Here we describe MIMIC-CXR-JPG v2.0.0, a large dataset of 377,110 chest x-rays associated with 227,827 imaging studies sourced from the Beth Israel Deaconess Medical Center between 2011 - 2016. Images are provided with 14 labels derived from two natural language processing tools applied to the corresponding free-text radiology reports. MIMIC-CXR-JPG is derived entirely from the MIMIC-CXR database, and aims to provide a convenient processed version of MIMIC-CXR, as well as to provide a standard reference for data splits and image labels. All images have been de-identified to protect patient privacy. The dataset is made freely available to facilitate and encourage a wide range of research in medical computer vision. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1901.07042v5-abstract-full').style.display = 'none'; document.getElementById('1901.07042v5-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 November, 2019; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 21 January, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2019. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1811.03455">arXiv:1811.03455</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1811.03455">pdf</a>, <a href="https://arxiv.org/format/1811.03455">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> High fidelity single-pixel imaging </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Deng%2C+C">Chao Deng</a>, <a href="/search/eess?searchtype=author&amp;query=Hu%2C+X">Xuemei Hu</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+X">Xiaoxu Li</a>, <a href="/search/eess?searchtype=author&amp;query=Suo%2C+J">Jinli Suo</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+Z">Zhili Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Dai%2C+Q">Qionghai Dai</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1811.03455v1-abstract-short" style="display: inline;"> Single-pixel imaging (SPI) is an emerging technique which has attracts wide attention in various research fields. 
However, restricted by the low reconstruction quality and large amount of measurements, the practical application is still in its infancy. Inspired by the fact that natural scenes exhibit unique degenerate structures in the low dimensional subspace, we propose to take advantage of the&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1811.03455v1-abstract-full').style.display = 'inline'; document.getElementById('1811.03455v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1811.03455v1-abstract-full" style="display: none;"> Single-pixel imaging (SPI) is an emerging technique that has attracted wide attention in various research fields. However, restricted by the low reconstruction quality and the large number of measurements required, its practical application is still in its infancy. Inspired by the fact that natural scenes exhibit unique degenerate structures in a low-dimensional subspace, we propose to take advantage of the local prior in convolutional sparse coding to implement high-fidelity single-pixel imaging. Specifically, through a statistical learning strategy, the target scene can be sparsely represented on an overcomplete dictionary. The dictionary is composed of various bases learned from a natural image database. We introduce the above local prior into the conventional SPI framework to improve the final reconstruction quality. Experiments on both synthetic data and real captured data demonstrate that our method achieves better reconstruction from the same measurements, and consequently reduces the number of measurements required for the same reconstruction quality. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1811.03455v1-abstract-full').style.display = 'none'; document.getElementById('1811.03455v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 November, 2018; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2018.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages, 6 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1708.06933">arXiv:1708.06933</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1708.06933">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1007/s12043-018-1590-5">10.1007/s12043-018-1590-5 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> On Non-Consensus Motions of Dynamical Linear Multi-Agent Systems </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Cai%2C+N">Ning Cai</a>, <a href="/search/eess?searchtype=author&amp;query=Deng%2C+C">Chun-Lin Deng</a>, <a href="/search/eess?searchtype=author&amp;query=Wu%2C+Q">Qiu-Xuan Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1708.06933v1-abstract-short" style="display: inline;"> The non-consensus problems of high-order linear time-invariant dynamical homogeneous multi-agent systems are considered. Based on the conditions for consensus achievement, the mechanisms that lead to non-consensus motions are analyzed. In addition, a comprehensive classification of the diverse types of non-consensus phases under the different conditions is conducted, which jointly depends o&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1708.06933v1-abstract-full').style.display = 'inline'; document.getElementById('1708.06933v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1708.06933v1-abstract-full" style="display: none;"> The non-consensus problems of high-order linear time-invariant dynamical homogeneous multi-agent systems are considered. Based on the conditions for consensus achievement, the mechanisms that lead to non-consensus motions are analyzed. In addition, a comprehensive classification of the diverse types of non-consensus phases under the different conditions is conducted, which jointly depends on the self-dynamics of the agents, the interaction protocol, and the graph topology. A series of numerical examples is presented to illustrate the theoretical analysis. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1708.06933v1-abstract-full').style.display = 'none'; document.getElementById('1708.06933v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 August, 2017; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2017.
