Search | arXiv e-print repository
<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 249 results for author: <span class="mathjax">Zhao, S</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> <div class="content"> <form method="GET" action="/search/eess" aria-role="search"> Searching in archive <strong>eess</strong>. <a href="/search/?searchtype=author&query=Zhao%2C+S">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Zhao, S"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Zhao%2C+S&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Zhao, S"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Zhao%2C+S&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Zhao%2C+S&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhao%2C+S&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhao%2C+S&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhao%2C+S&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhao%2C+S&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.16961">arXiv:2411.16961</a> <span> [<a href="https://arxiv.org/pdf/2411.16961">pdf</a>, <a href="https://arxiv.org/format/2411.16961">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Glo-In-One-v2: Holistic Identification of Glomerular Cells, Tissues, and Lesions in Human and Mouse Histopathology </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Yu%2C+L">Lining Yu</a>, <a href="/search/eess?searchtype=author&query=Yin%2C+M">Mengmeng Yin</a>, <a href="/search/eess?searchtype=author&query=Deng%2C+R">Ruining Deng</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+Q">Quan Liu</a>, <a href="/search/eess?searchtype=author&query=Yao%2C+T">Tianyuan Yao</a>, <a href="/search/eess?searchtype=author&query=Cui%2C+C">Can Cui</a>, <a href="/search/eess?searchtype=author&query=Guo%2C+J">Junlin Guo</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+Y">Yu Wang</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+Y">Yaohong Wang</a>, <a href="/search/eess?searchtype=author&query=Zhao%2C+S">Shilin Zhao</a>, <a href="/search/eess?searchtype=author&query=Yang%2C+H">Haichun Yang</a>, <a href="/search/eess?searchtype=author&query=Huo%2C+Y">Yuankai Huo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.16961v1-abstract-short" 
style="display: inline;"> Segmenting glomerular intraglomerular tissue and lesions traditionally depends on detailed morphological evaluations by expert nephropathologists, a labor-intensive process susceptible to interobserver variability. Our group previously developed the Glo-In-One toolkit for integrated detection and segmentation of glomeruli. In this study, we leverage the Glo-In-One toolkit to version 2 with fine-gr… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.16961v1-abstract-full').style.display = 'inline'; document.getElementById('2411.16961v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.16961v1-abstract-full" style="display: none;"> Segmenting glomerular intraglomerular tissue and lesions traditionally depends on detailed morphological evaluations by expert nephropathologists, a labor-intensive process susceptible to interobserver variability. Our group previously developed the Glo-In-One toolkit for integrated detection and segmentation of glomeruli. In this study, we leverage the Glo-In-One toolkit to version 2 with fine-grained segmentation capabilities, curating 14 distinct labels for tissue regions, cells, and lesions across a dataset of 23,529 annotated glomeruli across human and mouse histopathology data. To our knowledge, this dataset is among the largest of its kind to date.In this study, we present a single dynamic head deep learning architecture designed to segment 14 classes within partially labeled images of human and mouse pathology data. Our model was trained using a training set derived from 368 annotated kidney whole-slide images (WSIs) to identify 5 key intraglomerular tissues covering Bowman's capsule, glomerular tuft, mesangium, mesangial cells, and podocytes. Additionally, the network segments 9 glomerular lesion classes including adhesion, capsular drop, global sclerosis, hyalinosis, mesangial lysis, microaneurysm, nodular sclerosis, mesangial expansion, and segmental sclerosis. The glomerulus segmentation model achieved a decent performance compared with baselines, and achieved a 76.5 % average Dice Similarity Coefficient (DSC). Additional, transfer learning from rodent to human for glomerular lesion segmentation model has enhanced the average segmentation accuracy across different types of lesions by more than 3 %, as measured by Dice scores. The Glo-In-One-v2 model and trained weight have been made publicly available at https: //github.com/hrlblab/Glo-In-One_v2. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.16961v1-abstract-full').style.display = 'none'; document.getElementById('2411.16961v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.16729">arXiv:2411.16729</a> <span> [<a href="https://arxiv.org/pdf/2411.16729">pdf</a>, <a href="https://arxiv.org/format/2411.16729">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> DiM-Gestor: Co-Speech Gesture Generation with Adaptive Layer Normalization Mamba-2 </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhang%2C+F">Fan Zhang</a>, <a href="/search/eess?searchtype=author&query=Zhao%2C+S">Siyuan Zhao</a>, <a href="/search/eess?searchtype=author&query=Ji%2C+N">Naye Ji</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+Z">Zhaohan Wang</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+J">Jingmei Wu</a>, <a href="/search/eess?searchtype=author&query=Gao%2C+F">Fuxing Gao</a>, <a href="/search/eess?searchtype=author&query=Ye%2C+Z">Zhenqing Ye</a>, <a href="/search/eess?searchtype=author&query=Yan%2C+L">Leyao Yan</a>, <a href="/search/eess?searchtype=author&query=Dai%2C+L">Lanxin Dai</a>, <a href="/search/eess?searchtype=author&query=Geng%2C+W">Weidong Geng</a>, <a href="/search/eess?searchtype=author&query=Lyu%2C+X">Xin Lyu</a>, <a href="/search/eess?searchtype=author&query=Zhao%2C+B">Bozuo Zhao</a>, <a href="/search/eess?searchtype=author&query=Yu%2C+D">Dingguo Yu</a>, <a href="/search/eess?searchtype=author&query=Du%2C+H">Hui Du</a>, <a href="/search/eess?searchtype=author&query=Hu%2C+B">Bin Hu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.16729v1-abstract-short" style="display: inline;"> Speech-driven gesture generation using transformer-based generative models represents a rapidly advancing area within virtual human creation. However, existing models face significant challenges due to their quadratic time and space complexities, limiting scalability and efficiency. To address these limitations, we introduce DiM-Gestor, an innovative end-to-end generative model leveraging the Mamb… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.16729v1-abstract-full').style.display = 'inline'; document.getElementById('2411.16729v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.16729v1-abstract-full" style="display: none;"> Speech-driven gesture generation using transformer-based generative models represents a rapidly advancing area within virtual human creation. However, existing models face significant challenges due to their quadratic time and space complexities, limiting scalability and efficiency. 
To address these limitations, we introduce DiM-Gestor, an innovative end-to-end generative model leveraging the Mamba-2 architecture. DiM-Gestor features a dual-component framework: (1) a fuzzy feature extractor and (2) a speech-to-gesture mapping module, both built on the Mamba-2. The fuzzy feature extractor, integrated with a Chinese Pre-trained Model and Mamba-2, autonomously extracts implicit, continuous speech features. These features are synthesized into a unified latent representation and then processed by the speech-to-gesture mapping module. This module employs an Adaptive Layer Normalization (AdaLN)-enhanced Mamba-2 mechanism to uniformly apply transformations across all sequence tokens. This enables precise modeling of the nuanced interplay between speech features and gesture dynamics. We utilize a diffusion model to train and infer diverse gesture outputs. Extensive subjective and objective evaluations conducted on the newly released Chinese Co-Speech Gestures dataset corroborate the efficacy of our proposed model. Compared with Transformer-based architecture, the assessments reveal that our approach delivers competitive results and significantly reduces memory usage, approximately 2.4 times, and enhances inference speeds by 2 to 4 times. Additionally, we released the CCG dataset, a Chinese Co-Speech Gestures dataset, comprising 15.97 hours (six styles across five scenarios) of 3D full-body skeleton gesture motion performed by professional Chinese TV broadcasters. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.16729v1-abstract-full').style.display = 'none'; document.getElementById('2411.16729v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
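Note: as a rough illustration of the AdaLN mechanism named in this abstract (a conditioning vector predicts a per-channel scale and shift that modulate a normalized sequence, applied uniformly across tokens), here is a minimal PyTorch-style sketch; the module layout and dimensions are assumptions, not the authors' implementation.

```python
import torch
import torch.nn as nn

class AdaLN(nn.Module):
    """Adaptive Layer Normalization: a conditioning vector predicts per-channel
    scale and shift that modulate a LayerNorm'ed sequence, shared across tokens."""
    def __init__(self, d_model: int, d_cond: int):
        super().__init__()
        self.norm = nn.LayerNorm(d_model, elementwise_affine=False)
        self.to_scale_shift = nn.Linear(d_cond, 2 * d_model)

    def forward(self, x: torch.Tensor, cond: torch.Tensor) -> torch.Tensor:
        # x: (batch, seq_len, d_model); cond: (batch, d_cond)
        scale, shift = self.to_scale_shift(cond).chunk(2, dim=-1)
        return self.norm(x) * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1)

# usage sketch
x = torch.randn(2, 100, 256)    # latent speech/gesture tokens (hypothetical sizes)
cond = torch.randn(2, 128)      # e.g., a style or diffusion-step embedding
out = AdaLN(256, 128)(x, cond)
```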
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">13 pages, 11 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11762">arXiv:2411.11762</a> <span> [<a href="https://arxiv.org/pdf/2411.11762">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> High-Speed Cornering Control and Real-Vehicle Deployment for Autonomous Electric Vehicles </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhao%2C+S">Shiyue Zhao</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+J">Junzhi Zhang</a>, <a href="/search/eess?searchtype=author&query=Masoud%2C+N">Neda Masoud</a>, <a href="/search/eess?searchtype=author&query=Jiang%2C+Y">Yuhong Jiang</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+H">Heye Huang</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+T">Tao Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11762v2-abstract-short" style="display: inline;"> Executing drift maneuvers during high-speed cornering presents significant challenges for autonomous vehicles, yet offers the potential to minimize turning time and enhance driving dynamics. While reinforcement learning (RL) has shown promising results in simulated environments, discrepancies between simulations and real-world conditions have limited its practical deployment. This study introduces… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11762v2-abstract-full').style.display = 'inline'; document.getElementById('2411.11762v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11762v2-abstract-full" style="display: none;"> Executing drift maneuvers during high-speed cornering presents significant challenges for autonomous vehicles, yet offers the potential to minimize turning time and enhance driving dynamics. While reinforcement learning (RL) has shown promising results in simulated environments, discrepancies between simulations and real-world conditions have limited its practical deployment. This study introduces an innovative control framework that integrates trajectory optimization with drift maneuvers, aiming to improve the algorithm's adaptability for real-vehicle implementation. We leveraged Bezier-based pre-trajectory optimization to enhance rewards and optimize the controller through Twin Delayed Deep Deterministic Policy Gradient (TD3) in a simulated environment. For real-world deployment, we implement a hybrid RL-MPC fusion mechanism, , where TD3-derived maneuvers serve as primary inputs for a Model Predictive Controller (MPC). This integration enables precise real-time tracking of the optimal trajectory, with MPC providing corrective inputs to bridge the gap between simulation and reality. The efficacy of this method is validated through real-vehicle tests on consumer-grade electric vehicles, focusing on drift U-turns and drift right-angle turns. 
The control outcomes of these real-vehicle tests are thoroughly documented in the paper, supported by supplementary video evidence (https://youtu.be/5wp67FcpfL8). Notably, this study is the first to deploy and apply an RL-based transient drift cornering algorithm on consumer-grade electric vehicles. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11762v2-abstract-full').style.display = 'none'; document.getElementById('2411.11762v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">In the process of being submitted to the Journal of IEEE Transactions on Industrial Electronics</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.10570">arXiv:2411.10570</a> <span> [<a href="https://arxiv.org/pdf/2411.10570">pdf</a>, <a href="https://arxiv.org/format/2411.10570">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Normative Modeling for AD Diagnosis and Biomarker Identification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhao%2C+S">Songlin Zhao</a>, <a href="/search/eess?searchtype=author&query=Zhou%2C+R">Rong Zhou</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+Y">Yu Zhang</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+Y">Yong Chen</a>, <a href="/search/eess?searchtype=author&query=He%2C+L">Lifang He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.10570v1-abstract-short" style="display: inline;"> In this paper, we introduce a novel normative modeling approach that incorporates focal loss and adversarial autoencoders (FAAE) for Alzheimer's Disease (AD) diagnosis and biomarker identification. Our method is an end-to-end approach that embeds an adversarial focal loss discriminator within the autoencoder structure, specifically designed to effectively target and capture more complex and challe… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10570v1-abstract-full').style.display = 'inline'; document.getElementById('2411.10570v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.10570v1-abstract-full" style="display: none;"> In this paper, we introduce a novel normative modeling approach that incorporates focal loss and adversarial autoencoders (FAAE) for Alzheimer's Disease (AD) diagnosis and biomarker identification. 
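Note: the hybrid RL-MPC fusion described in this abstract (an RL policy proposes the maneuver, an MPC tracks it and adds a corrective input) can be sketched schematically as below. The policy, solver, and signal shapes are hypothetical placeholders, not the authors' controller.

```python
import numpy as np

def rl_mpc_step(state, td3_policy, mpc_solve, horizon=20):
    """One control step of a hybrid RL-MPC scheme (illustrative sketch only).

    td3_policy(state) -> nominal action from the trained RL agent (primary input).
    mpc_solve(state, reference, horizon) -> corrective action that tracks the
    reference subject to a vehicle model and actuator constraints.
    """
    reference = td3_policy(state)                       # RL-derived maneuver
    correction = mpc_solve(state, reference, horizon)   # model-predictive correction
    return reference + correction                       # command sent to the vehicle

# Placeholder components so the sketch runs end to end.
td3_policy = lambda s: np.array([0.3, 0.1])                       # [steering, throttle]
mpc_solve = lambda s, ref, h: 0.05 * (np.zeros_like(ref) - ref)   # toy proportional fix
print(rl_mpc_step(np.zeros(4), td3_policy, mpc_solve))
```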
4. arXiv:2411.10570 [pdf, other]  Categories: eess.IV, cs.CV, cs.LG
   Normative Modeling for AD Diagnosis and Biomarker Identification
   Authors: Songlin Zhao, Rong Zhou, Yu Zhang, Yong Chen, Lifang He
   Abstract: In this paper, we introduce a novel normative modeling approach that incorporates focal loss and adversarial autoencoders (FAAE) for Alzheimer's Disease (AD) diagnosis and biomarker identification. Our method is an end-to-end approach that embeds an adversarial focal loss discriminator within the autoencoder structure, specifically designed to target and capture more complex and challenging cases. We first use the enhanced autoencoder to create a normative model based on data from healthy control (HC) individuals. We then apply this model to estimate total and regional neuroanatomical deviation in AD patients. In extensive experiments on the OASIS-3 and ADNI datasets, our approach significantly outperforms previous state-of-the-art methods. This advancement not only streamlines the detection process but also provides greater insight into the biomarker potential for AD. Our code can be found at https://github.com/soz223/FAAE.
   Submitted 15 November, 2024; originally announced November 2024.
   Comments: 10 pages, 3 figures
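Note: the normative-modeling step described here (fit a model on healthy controls, then score patients by how far they deviate from it) can be illustrated with a small numpy sketch. The deviation-as-z-score formulation below is a generic stand-in, not the FAAE architecture or scoring rule from the paper.

```python
import numpy as np

def deviation_scores(recon_error_patients: np.ndarray, recon_error_hc: np.ndarray) -> np.ndarray:
    """Regional deviation of patients from a normative model, expressed as z-scores
    relative to the reconstruction-error distribution of healthy controls."""
    mu = recon_error_hc.mean(axis=0)            # per-region mean error on HC data
    sigma = recon_error_hc.std(axis=0) + 1e-8   # per-region spread on HC data
    return (recon_error_patients - mu) / sigma

# Toy example: 100 healthy controls and 20 patients, 10 brain regions each.
rng = np.random.default_rng(0)
err_hc = np.abs(rng.normal(0.0, 1.0, size=(100, 10)))
err_ad = np.abs(rng.normal(0.5, 1.2, size=(20, 10)))   # patients deviate more
z = deviation_scores(err_ad, err_hc)
total_deviation = z.sum(axis=1)                        # one scalar deviation per patient
```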
5. arXiv:2411.00078 [pdf, other]  Categories: cs.CV, cs.AI, eess.IV
   How Good Are We? Evaluating Cell AI Foundation Models in Kidney Pathology with Human-in-the-Loop Enrichment
   Authors: Junlin Guo, Siqi Lu, Can Cui, Ruining Deng, Tianyuan Yao, Zhewen Tao, Yizhe Lin, Marilyn Lionts, Quan Liu, Juming Xiong, Yu Wang, Shilin Zhao, Catie Chang, Mitchell Wilkes, Mengmeng Yin, Haichun Yang, Yuankai Huo
   Abstract: Training AI foundation models has emerged as a promising large-scale learning approach for addressing real-world healthcare challenges, including digital pathology. While many of these models have been developed for tasks like disease diagnosis and tissue quantification using extensive and diverse training datasets, their readiness for deployment on some of the arguably simplest tasks, such as nuclei segmentation within a single organ (e.g., the kidney), remains uncertain. This paper seeks to answer the key question, "How good are we?", by thoroughly evaluating the performance of recent cell foundation models on a curated multi-center, multi-disease, and multi-species external testing dataset. Additionally, we tackle a more challenging question, "How can we improve?", by developing and assessing human-in-the-loop data enrichment strategies aimed at enhancing model performance while minimizing reliance on pixel-level human annotation. To address the first question, we curated a multi-center, multi-disease, and multi-species dataset consisting of 2,542 kidney whole-slide images (WSIs). Three state-of-the-art (SOTA) cell foundation models (Cellpose, StarDist, and CellViT) were selected for evaluation. To tackle the second question, we explored data enrichment algorithms that distill predictions from the different foundation models within a human-in-the-loop framework, aiming to further enhance foundation model performance with minimal human effort. Our experimental results showed that all three foundation models improved over their baselines after fine-tuning with enriched data. Interestingly, the baseline model with the highest F1 score did not yield the best segmentation outcomes after fine-tuning. This study establishes a benchmark for the development and deployment of cell vision foundation models tailored for real-world data applications.
   Submitted 31 October, 2024; originally announced November 2024.
6. arXiv:2410.23695 [pdf, other]  Categories: eess.SP
   Parameterized TDOA: Instantaneous TDOA Estimation and Localization for Mobile Targets in a Time-Division Broadcast Positioning System
   Authors: Chenxin Tu, Xiaowei Cui, Gang Liu, Sihao Zhao, Mingquan Lu
   Abstract: Localization of mobile targets is a fundamental problem across various domains. One-way ranging-based downlink localization has gained significant attention due to its ability to support an unlimited number of targets and enable autonomous navigation by performing localization at the target side. Time-difference-of-arrival (TDOA)-based methods are particularly advantageous as they obviate the need for target-anchor synchronization, unlike time-of-arrival (TOA)-based approaches. However, existing TDOA estimation methods inherently rely on the quasi-static assumption (QSA), which assumes that targets remain stationary during the measurement period, thereby limiting their applicability in dynamic environments. In this paper, we propose a novel instantaneous TDOA estimation method for dynamic environments, termed Parameterized TDOA (P-TDOA). We first characterize the nonlinear, time-varying TDOA measurements using polynomial models and construct a system of linear equations for the model parameters through dedicated transformations, employing a novel successive time difference strategy (STDS). Subsequently, we solve the parameters with a weighted least squares (WLS) solution, thereby obtaining instantaneous TDOA estimates. Furthermore, we develop a mobile target localization approach that leverages instantaneous TDOA estimates from multiple anchor pairs at the same instant. Theoretical analysis shows that our proposed method can approach the Cramer-Rao lower bound (CRLB) of instantaneous TDOA estimation and localization in concurrent TOA scenarios, despite actual TOA measurements being obtained sequentially. Extensive numerical simulations validate our theoretical analysis and demonstrate the effectiveness of the proposed method, highlighting its superiority over state-of-the-art approaches across various scenarios.
   Submitted 31 October, 2024; originally announced October 2024.
   Comments: This work has been submitted to the IEEE for possible publication
7. arXiv:2410.21276 [pdf, other]  Categories: cs.CL, cs.AI, cs.CV, cs.CY, cs.LG, cs.SD, eess.AS
   GPT-4o System Card
   Authors: OpenAI: Aaron Hurst, Adam Lerer, Adam P. Goucher, Adam Perelman, Aditya Ramesh, Aidan Clark, AJ Ostrow, Akila Welihinda, Alan Hayes, Alec Radford, Aleksander Mądry, Alex Baker-Whitcomb, Alex Beutel, Alex Borzunov, Alex Carney, Alex Chow, Alex Kirillov, Alex Nichol, Alex Paino, Alex Renzin, Alex Tachard Passos, Alexander Kirillov, Alexi Christakis, et al. (395 additional authors not shown)
   Abstract: GPT-4o is an autoregressive omni model that accepts as input any combination of text, audio, image, and video, and generates any combination of text, audio, and image outputs. It is trained end-to-end across text, vision, and audio, meaning all inputs and outputs are processed by the same neural network. GPT-4o can respond to audio inputs in as little as 232 milliseconds, with an average of 320 milliseconds, which is similar to human response time in conversation. It matches GPT-4 Turbo performance on text in English and code, with significant improvement on text in non-English languages, while also being much faster and 50% cheaper in the API. GPT-4o is especially better at vision and audio understanding compared to existing models. In line with our commitment to building AI safely and consistent with our voluntary commitments to the White House, we are sharing the GPT-4o System Card, which includes our Preparedness Framework evaluations. In this System Card, we provide a detailed look at GPT-4o's capabilities, limitations, and safety evaluations across multiple categories, focusing on speech-to-speech while also evaluating text and image capabilities, and measures we've implemented to ensure the model is safe and aligned. We also include third-party assessments on dangerous capabilities, as well as discussion of potential societal impacts of GPT-4o's text and vision capabilities.
   Submitted 25 October, 2024; originally announced October 2024.
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.18584">arXiv:2409.18584</a> <span> [<a href="https://arxiv.org/pdf/2409.18584">pdf</a>, <a href="https://arxiv.org/format/2409.18584">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> ChildMandarin: A Comprehensive Mandarin Speech Dataset for Young Children Aged 3-5 </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhou%2C+J">Jiaming Zhou</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+S">Shiyao Wang</a>, <a href="/search/eess?searchtype=author&query=Zhao%2C+S">Shiwan Zhao</a>, <a href="/search/eess?searchtype=author&query=He%2C+J">Jiabei He</a>, <a href="/search/eess?searchtype=author&query=Sun%2C+H">Haoqin Sun</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+H">Hui Wang</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+C">Cheng Liu</a>, <a href="/search/eess?searchtype=author&query=Kong%2C+A">Aobo Kong</a>, <a href="/search/eess?searchtype=author&query=Guo%2C+Y">Yujie Guo</a>, <a href="/search/eess?searchtype=author&query=Qin%2C+Y">Yong Qin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.18584v2-abstract-short" style="display: inline;"> Automatic speech recognition (ASR) systems have advanced significantly with models like Whisper, Conformer, and self-supervised frameworks such as Wav2vec 2.0 and HuBERT. However, developing robust ASR models for young children's speech remains challenging due to differences in pronunciation, tone, and pace compared to adult speech. In this paper, we introduce a new Mandarin speech dataset focused… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.18584v2-abstract-full').style.display = 'inline'; document.getElementById('2409.18584v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.18584v2-abstract-full" style="display: none;"> Automatic speech recognition (ASR) systems have advanced significantly with models like Whisper, Conformer, and self-supervised frameworks such as Wav2vec 2.0 and HuBERT. However, developing robust ASR models for young children's speech remains challenging due to differences in pronunciation, tone, and pace compared to adult speech. In this paper, we introduce a new Mandarin speech dataset focused on children aged 3 to 5, addressing the scarcity of resources in this area. The dataset comprises 41.25 hours of speech with carefully crafted manual transcriptions, collected from 397 speakers across various provinces in China, with balanced gender representation. We provide a comprehensive analysis of speaker demographics, speech duration distribution and geographic coverage. Additionally, we evaluate ASR performance on models trained from scratch, such as Conformer, as well as fine-tuned pre-trained models like HuBERT and Whisper, where fine-tuning demonstrates significant performance improvements. 
Furthermore, we assess speaker verification (SV) on our dataset, showing that, despite the challenges posed by the unique vocal characteristics of young children, the dataset effectively supports both ASR and SV tasks. This dataset is a valuable contribution to Mandarin child speech research and holds potential for applications in educational technology and child-computer interaction. It will be open-source and freely available for all academic purposes. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.18584v2-abstract-full').style.display = 'none'; document.getElementById('2409.18584v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 27 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.16681">arXiv:2409.16681</a> <span> [<a href="https://arxiv.org/pdf/2409.16681">pdf</a>, <a href="https://arxiv.org/format/2409.16681">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Emotional Dimension Control in Language Model-Based Text-to-Speech: Spanning a Broad Spectrum of Human Emotions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhou%2C+K">Kun Zhou</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+Y">You Zhang</a>, <a href="/search/eess?searchtype=author&query=Zhao%2C+S">Shengkui Zhao</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+H">Hao Wang</a>, <a href="/search/eess?searchtype=author&query=Pan%2C+Z">Zexu Pan</a>, <a href="/search/eess?searchtype=author&query=Ng%2C+D">Dianwen Ng</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+C">Chong Zhang</a>, <a href="/search/eess?searchtype=author&query=Ni%2C+C">Chongjia Ni</a>, <a href="/search/eess?searchtype=author&query=Ma%2C+Y">Yukun Ma</a>, <a href="/search/eess?searchtype=author&query=Nguyen%2C+T+H">Trung Hieu Nguyen</a>, <a href="/search/eess?searchtype=author&query=Yip%2C+J+Q">Jia Qi Yip</a>, <a href="/search/eess?searchtype=author&query=Ma%2C+B">Bin Ma</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.16681v1-abstract-short" style="display: inline;"> Current emotional text-to-speech (TTS) systems face challenges in mimicking a broad spectrum of human emotions due to the inherent complexity of emotions and limitations in emotional speech datasets and models. 
9. arXiv:2409.16681 [pdf, other]  Categories: eess.AS, cs.CL, cs.SD
   Emotional Dimension Control in Language Model-Based Text-to-Speech: Spanning a Broad Spectrum of Human Emotions
   Authors: Kun Zhou, You Zhang, Shengkui Zhao, Hao Wang, Zexu Pan, Dianwen Ng, Chong Zhang, Chongjia Ni, Yukun Ma, Trung Hieu Nguyen, Jia Qi Yip, Bin Ma
   Abstract: Current emotional text-to-speech (TTS) systems face challenges in mimicking a broad spectrum of human emotions due to the inherent complexity of emotions and limitations in emotional speech datasets and models. This paper proposes a TTS framework that facilitates control over pleasure, arousal, and dominance, and can synthesize a diversity of emotional styles without requiring any emotional speech data during TTS training. We train an emotional attribute predictor using only categorical labels from speech data, aligning with psychological research and incorporating anchored dimensionality reduction on self-supervised learning (SSL) features. The TTS framework converts text inputs into phonetic tokens via an autoregressive language model and uses pseudo-emotional dimensions to guide the parallel prediction of fine-grained acoustic details. Experiments conducted on the LibriTTS dataset demonstrate that our framework can synthesize speech with enhanced naturalness and a variety of emotional styles by effectively controlling emotional dimensions, even without the inclusion of any emotional speech during TTS training.
   Submitted 25 September, 2024; originally announced September 2024.
   Comments: submitted to ICASSP 2025
arXiv:2409.12466 [cs.SD, eess.AS]
AudioEditor: A Training-Free Diffusion-Based Audio Editing Framework
Authors: Yuhang Jia, Yang Chen, Jinghua Zhao, Shiwan Zhao, Wenjia Zeng, Yong Chen, Yong Qin
Abstract: Diffusion-based text-to-audio (TTA) generation has made substantial progress, leveraging latent diffusion models (LDMs) to produce high-quality, diverse, and instruction-relevant audio. However, beyond generation, the task of audio editing remains equally important but has received comparatively little attention. Audio editing tasks face two primary challenges: executing precise edits and preserving the unedited sections. While workflows based on LDMs have effectively addressed these challenges in the field of image processing, similar approaches have been scarcely applied to audio editing. In this paper, we introduce AudioEditor, a training-free audio editing framework built on a pretrained diffusion-based TTA model. AudioEditor incorporates Null-text Inversion and EOT-suppression methods, enabling the model to preserve original audio features while executing accurate edits. Comprehensive objective and subjective experiments validate the effectiveness of AudioEditor in delivering high-quality audio edits. Code and demo can be found at https://github.com/NKU-HLT/AudioEditor.
Submitted 29 September, 2024; v1 submitted 19 September, 2024; originally announced September 2024.

arXiv:2409.11889 [cs.SD, eess.AS]
M2R-Whisper: Multi-stage and Multi-scale Retrieval Augmentation for Enhancing Whisper
Authors: Jiaming Zhou, Shiwan Zhao, Jiabei He, Hui Wang, Wenjia Zeng, Yong Chen, Haoqin Sun, Aobo Kong, Yong Qin
Abstract: State-of-the-art models like OpenAI's Whisper exhibit strong performance in multilingual automatic speech recognition (ASR), but they still face challenges in accurately recognizing diverse subdialects. In this paper, we propose M2R-Whisper, a novel multi-stage and multi-scale retrieval augmentation approach designed to enhance ASR performance in low-resource settings. Building on the principles of in-context learning (ICL) and retrieval-augmented techniques, our method employs sentence-level ICL in the pre-processing stage to harness contextual information, while integrating token-level k-Nearest Neighbors (kNN) retrieval as a post-processing step to further refine the final output distribution. By synergistically combining sentence-level and token-level retrieval strategies, M2R-Whisper effectively mitigates various types of recognition errors. Experiments conducted on Mandarin and subdialect datasets, including AISHELL-1 and KeSpeech, demonstrate substantial improvements in ASR accuracy, all achieved without any parameter updates.
Submitted 18 September, 2024; originally announced September 2024.

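The token-level kNN post-processing referred to above follows the general kNN-LM recipe: look up the current decoder state in a datastore of cached states, turn the retrieved tokens into a distribution, and interpolate it with the model's own distribution. A rough sketch under assumed datastore layout, temperature, and interpolation weight (none of which come from the paper):

```python
import numpy as np

def knn_token_distribution(query, keys, values, vocab_size, k=8, temperature=10.0):
    """Turn the k nearest datastore entries into a distribution over tokens.

    query:  (D,) decoder hidden state for the current step
    keys:   (N, D) cached hidden states
    values: (N,) token ids that followed each cached state
    """
    dists = np.linalg.norm(keys - query, axis=1)   # L2 distances to every entry
    idx = np.argsort(dists)[:k]
    weights = np.exp(-dists[idx] / temperature)
    weights /= weights.sum()
    p_knn = np.zeros(vocab_size)
    np.add.at(p_knn, values[idx], weights)         # scatter-add weight per retrieved token
    return p_knn

def interpolate(p_model, p_knn, lam=0.3):
    """Final distribution: lam * kNN + (1 - lam) * model."""
    return lam * p_knn + (1.0 - lam) * p_model

# toy usage
rng = np.random.default_rng(0)
keys, values = rng.normal(size=(1000, 64)), rng.integers(0, 500, size=1000)
p_model = np.full(500, 1 / 500)
p_knn = knn_token_distribution(rng.normal(size=64), keys, values, vocab_size=500)
p_final = interpolate(p_model, p_knn)
```
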
arXiv:2409.11508 [eess.IV, cs.CV]
Retinal Vessel Segmentation with Deep Graph and Capsule Reasoning
Authors: Xinxu Wei, Xi Lin, Haiyun Liu, Shixuan Zhao, Yongjie Li
Abstract: Effective retinal vessel segmentation requires a sophisticated integration of global contextual awareness and local vessel continuity. To address this challenge, we propose the Graph Capsule Convolution Network (GCC-UNet), which merges capsule convolutions with CNNs to capture both local and global features. The Graph Capsule Convolution operator is specifically designed to enhance the representation of global context, while the Selective Graph Attention Fusion module ensures seamless integration of local and global information. To further improve vessel continuity, we introduce the Bottleneck Graph Attention module, which incorporates Channel-wise and Spatial Graph Attention mechanisms. The Multi-Scale Graph Fusion module adeptly combines features from various scales. Our approach has been rigorously validated through experiments on widely used public datasets, with ablation studies confirming the efficacy of each component. Comparative results highlight GCC-UNet's superior performance over existing methods, setting a new benchmark in retinal vessel segmentation. Notably, this work represents the first integration of vanilla, graph, and capsule convolutional techniques in the domain of medical image segmentation.
Submitted 17 September, 2024; originally announced September 2024.

arXiv:2409.05096 [eess.SP, cs.LG] doi:10.1109/TNSM.2024.3457579
Time-Distributed Feature Learning for Internet of Things Network Traffic Classification
Authors: Yoga Suhas Kuruba Manjunath, Sihao Zhao, Xiao-Ping Zhang, Lian Zhao
Abstract: Deep learning-based network traffic classification (NTC) techniques, including conventional and class-of-service (CoS) classifiers, are a popular tool that aids in quality of service (QoS) and radio resource management for Internet of Things (IoT) networks. Holistic temporal features consist of inter-, intra-, and pseudo-temporal features within packets, between packets, and among flows, providing the maximum information on network services without depending on defined classes in a problem. Conventional spatio-temporal features in current solutions extract only space and time information between packets and flows, ignoring the information within packets and flows for IoT traffic. Therefore, we propose a new, efficient, holistic feature extraction method for deep-learning-based NTC using time-distributed feature learning to maximize the accuracy of the NTC. We apply a time-distributed wrapper on deep-learning layers to help extract pseudo-temporal features and spatio-temporal features. Pseudo-temporal features are mathematically complex to explain since, in deep learning, a black box extracts them; however, the features are temporal because of the time-distributed wrapper, and therefore we call them pseudo-temporal features. Since our method is efficient in learning holistic temporal features, we can extend it to both conventional and CoS NTC. Our solution proves that pseudo-temporal and spatio-temporal features can significantly improve the robustness and performance of any NTC. We analyze the solution theoretically and experimentally on different real-world datasets. The experimental results show that the holistic-temporal time-distributed feature learning method is, on average, 13.5% more accurate than state-of-the-art conventional and CoS classifiers.
Submitted 8 September, 2024; originally announced September 2024.

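A "time-distributed wrapper" in the sense used above applies the same sub-network independently to every time step of a sequence (Keras exposes this as TimeDistributed). A minimal PyTorch analogue, with packet and flow shapes invented purely for illustration:

```python
import torch
import torch.nn as nn

class TimeDistributed(nn.Module):
    """Apply the same module to every time step of a (batch, time, ...) tensor."""

    def __init__(self, module):
        super().__init__()
        self.module = module

    def forward(self, x):                                   # x: (B, T, *feature_dims)
        b, t = x.shape[:2]
        y = self.module(x.reshape(b * t, *x.shape[2:]))     # fold time into the batch axis
        return y.reshape(b, t, *y.shape[1:])                # unfold back to (B, T, ...)

# toy usage: a per-packet feature extractor applied to each of T packets in a flow
per_packet = nn.Sequential(nn.Flatten(), nn.Linear(64, 16), nn.ReLU())
flows = torch.randn(8, 20, 64)                   # (batch, packets per flow, packet features)
features = TimeDistributed(per_packet)(flows)    # (8, 20, 16)
print(features.shape)
```
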
arXiv:2409.04799 [cs.SD, eess.AS]
PB-LRDWWS System for the SLT 2024 Low-Resource Dysarthria Wake-Up Word Spotting Challenge
Authors: Shiyao Wang, Jiaming Zhou, Shiwan Zhao, Yong Qin
Abstract: For the SLT 2024 Low-Resource Dysarthria Wake-Up Word Spotting (LRDWWS) Challenge, we introduce the PB-LRDWWS system. This system combines a dysarthric speech content feature extractor for prototype construction with a prototype-based classification method. The feature extractor is a fine-tuned HuBERT model obtained through a three-stage fine-tuning process using cross-entropy loss. This fine-tuned HuBERT extracts features from the target dysarthric speaker's enrollment speech to build prototypes. Classification is achieved by calculating the cosine similarity between the HuBERT features of the target dysarthric speaker's evaluation speech and the prototypes. Despite its simplicity, our method demonstrates effectiveness through experimental results. Our system achieves second place in the final Test-B of the LRDWWS Challenge.
Submitted 7 September, 2024; originally announced September 2024.
Comments: accepted by SLT 2024

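The prototype-plus-cosine-similarity classification described above can be sketched in a few lines once features are assumed to be precomputed: average the enrollment features of each wake word into a prototype, then score an evaluation utterance against every prototype. Shapes, pooling, and the random stand-ins below are assumptions, not the system's actual configuration.

```python
import torch
import torch.nn.functional as F

def build_prototypes(enroll_feats):
    """enroll_feats: dict word -> list of (T, D) feature tensors from enrollment speech.
    Returns dict word -> (D,) prototype (mean over frames and utterances)."""
    return {w: torch.cat(fs, dim=0).mean(dim=0) for w, fs in enroll_feats.items()}

def classify(eval_feat, prototypes):
    """eval_feat: (T, D) features of one evaluation utterance.
    Returns (best_word, similarity) by cosine similarity to each prototype."""
    pooled = eval_feat.mean(dim=0)
    scores = {w: F.cosine_similarity(pooled, p, dim=0).item() for w, p in prototypes.items()}
    return max(scores.items(), key=lambda kv: kv[1])

# toy usage with random tensors standing in for HuBERT features
enroll = {"lights_on":  [torch.randn(50, 768) for _ in range(3)],
          "lights_off": [torch.randn(50, 768) for _ in range(3)]}
protos = build_prototypes(enroll)
print(classify(torch.randn(60, 768), protos))
```
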
arXiv:2409.04353 [eess.SP]
Whole Heart Perfusion with High-Multiband Simultaneous Multislice Imaging via Linear Phase Modulated Extended Field of View (SMILE)
Authors: Shen Zhao, Junyu Wang, Xitong Wang, Sizhuo Liu, Quan Chen, Michael Salerno
Abstract: Purpose: To develop a simultaneous multislice (SMS) first-pass perfusion technique that can achieve whole-heart coverage with high multiband factors, while avoiding the issue of slice leakage. Methods: The proposed Simultaneous Multislice Imaging via Linear phase modulated Extended field of view (SMILE) treats the SMS acquisition and reconstruction within an extended field of view framework, allowing arbitrary under-sampling of phase-encoding lines of the extended k-space matrix and enabling the direct application of 2D parallel imaging reconstruction techniques. We present a theoretical framework that offers insights into the performance of SMILE. We performed retrospective comparisons on 28 subjects and prospective perfusion experiments on 49 patients undergoing routine clinical CMR studies with SMILE at multiband (MB) factors of 3-5, with total acceleration factors ($R$) of 8 and 10 respectively, and compared SMILE to conventional SMS techniques using standard-FOV 2D CAIPI acquisition and standard 2D slice separation techniques, including split-slice GRAPPA and ROCK-SPIRiT. Results: Retrospective studies demonstrated a 5.2 to 8.0 dB improvement in signal-to-error ratio (SER) of SMILE over CAIPI perfusion. Prospective studies showed good image quality, with grades of 4.5 $\pm$ 0.5 for MB=3, $R$=8 and 3.6 $\pm$ 0.8 for MB=5, $R$=10 on a 5-point Likert scale. Conclusion: The theoretical derivation and experimental results validate SMILE's improved performance at high acceleration and MB factors compared to existing 2D CAIPI SMS acquisition and reconstruction techniques for first-pass myocardial perfusion imaging.
Submitted 6 September, 2024; originally announced September 2024.
Comments: 15 pages, 12 figures

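The "linear phase modulated extended field of view" idea rests on the Fourier shift theorem: a linear phase ramp applied across k-space lines shifts a slice within a larger reconstructed FOV, so simultaneously excited slices can be laid out side by side. A toy numpy illustration of that shift mechanism only, with invented sizes; it is not the acquisition or parallel-imaging reconstruction described in the paper.

```python
import numpy as np

def shift_by_linear_phase(img, dy):
    """Fourier shift theorem: a linear phase ramp along ky circularly shifts
    the image by dy pixels along the phase-encode direction."""
    ky = np.fft.fftfreq(img.shape[0])[:, None]    # cycles per pixel
    return np.fft.ifft2(np.fft.fft2(img) * np.exp(-2j * np.pi * ky * dy))

# three simultaneously excited "slices", each given a different linear phase so
# that they land at different offsets of a 3x extended field of view
rng = np.random.default_rng(0)
slices = [rng.random((64, 64)) for _ in range(3)]
extended = [np.pad(s, ((0, 128), (0, 0))) for s in slices]          # 192 x 64 grid
collapsed = sum(shift_by_linear_phase(e, 64 * i) for i, e in enumerate(extended))
print(collapsed.shape)                                               # (192, 64)
```
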
arXiv:2409.04016 [cs.SD, eess.AS]
Investigating Neural Audio Codecs for Speech Language Model-Based Speech Generation
Authors: Jiaqi Li, Dongmei Wang, Xiaofei Wang, Yao Qian, Long Zhou, Shujie Liu, Midia Yousefi, Canrun Li, Chung-Hsien Tsai, Zhen Xiao, Yanqing Liu, Junkun Chen, Sheng Zhao, Jinyu Li, Zhizheng Wu, Michael Zeng
Abstract: Neural audio codec tokens serve as the fundamental building blocks for speech language model (SLM)-based speech generation. However, there is no systematic understanding of how the codec system affects the speech generation performance of the SLM. In this work, we examine codec tokens within the SLM framework for speech generation to provide insights for effective codec design. We retrain existing high-performing neural codec models on the same dataset and loss functions to compare their performance in a uniform setting. We integrate codec tokens into two SLM systems: a mask-based parallel speech generation system and an auto-regressive (AR) plus non-auto-regressive (NAR) model-based system. Our findings indicate that better speech reconstruction in codec systems does not guarantee improved speech generation in SLM. A high-quality codec decoder is crucial for natural speech production in SLM, while speech intelligibility depends more on the quantization mechanism.
Submitted 6 September, 2024; originally announced September 2024.
Comments: Accepted by SLT-2024

arXiv:2409.00292 [cs.CL, cs.SD, eess.AS]
REFFLY: Melody-Constrained Lyrics Editing Model
Authors: Songyan Zhao, Bingxuan Li, Yufei Tian, Nanyun Peng
Abstract: Automatic melody-to-lyric generation aims to produce lyrics that align with a given melody. Although previous work can generate lyrics based on high-level control signals, such as keywords or genre, it often struggles with three challenges: (1) lack of controllability, as prior works are only able to produce lyrics from scratch, with little or no control over the content; (2) inability to generate fully structured songs with the desired format; and (3) failure to align prominent words in the lyrics with prominent notes in the melody, resulting in poor lyrics-melody alignment. In this work, we introduce REFFLY (REvision Framework For Lyrics), the first revision framework designed to edit arbitrary forms of plain-text draft into high-quality, full-fledged song lyrics. Our approach ensures that the generated lyrics retain the original meaning of the draft, align with the melody, and adhere to the desired song structures. We demonstrate that REFFLY performs well in diverse task settings, such as lyrics revision and song translation. Experimental results show that our model outperforms strong baselines, such as Lyra (Tian et al. 2023) and GPT-4, by 25% in both musicality and text quality.
Submitted 30 August, 2024; originally announced September 2024.

arXiv:2408.12829 [cs.LG, cs.SD, eess.AS]
Uncertainty-Aware Mean Opinion Score Prediction
Authors: Hui Wang, Shiwan Zhao, Jiaming Zhou, Xiguang Zheng, Haoqin Sun, Xuechen Wang, Yong Qin
Abstract: Mean Opinion Score (MOS) prediction has made significant progress in specific domains. However, the unstable performance of MOS prediction models across diverse samples presents ongoing challenges to the practical application of these systems. In this paper, we point out that the absence of uncertainty modeling is a significant limitation hindering MOS prediction systems from being applied in the real and open world. We analyze the sources of uncertainty in the MOS prediction task and propose to establish an uncertainty-aware MOS prediction system that models aleatory uncertainty and epistemic uncertainty by heteroscedastic regression and Monte Carlo dropout, respectively. The experimental results show that the system captures uncertainty well and is capable of performing selective prediction and out-of-domain detection. Such capabilities significantly enhance the practical utility of MOS systems in diverse real and open-world environments.
Submitted 23 August, 2024; originally announced August 2024.
Comments: Accepted by Interspeech 2024, oral

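The two uncertainty mechanisms named above are standard building blocks: a heteroscedastic head that predicts a mean and a variance trained with a Gaussian negative log-likelihood (aleatory/data uncertainty), and Monte Carlo dropout at inference time (epistemic/model uncertainty). A compact sketch with an invented embedding size and architecture, not the paper's model:

```python
import torch
import torch.nn as nn

class MOSHead(nn.Module):
    """Predict a MOS mean and a log-variance from a pooled utterance embedding."""
    def __init__(self, dim=256):
        super().__init__()
        self.body = nn.Sequential(nn.Linear(dim, 128), nn.ReLU(), nn.Dropout(0.2))
        self.mean = nn.Linear(128, 1)
        self.logvar = nn.Linear(128, 1)

    def forward(self, x):
        h = self.body(x)
        return self.mean(h).squeeze(-1), self.logvar(h).squeeze(-1)

def heteroscedastic_nll(mean, logvar, target):
    """Gaussian NLL: models aleatory (data) uncertainty per sample."""
    return (0.5 * (logvar + (target - mean) ** 2 / logvar.exp())).mean()

@torch.no_grad()
def mc_dropout_predict(model, x, passes=20):
    """Keep dropout active and average several stochastic passes;
    the spread of the predicted means estimates epistemic uncertainty."""
    model.train()                      # keeps Dropout stochastic at inference
    means = torch.stack([model(x)[0] for _ in range(passes)])
    return means.mean(0), means.var(0)

# toy usage
model, x, y = MOSHead(), torch.randn(4, 256), torch.tensor([3.5, 4.0, 2.5, 4.5])
loss = heteroscedastic_nll(*model(x), y)
mu, epistemic_var = mc_dropout_predict(model, x)
```
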
arXiv:2408.09278 [eess.IV, cs.CV]
Cross-Species Data Integration for Enhanced Layer Segmentation in Kidney Pathology
Authors: Junchao Zhu, Mengmeng Yin, Ruining Deng, Yitian Long, Yu Wang, Yaohong Wang, Shilin Zhao, Haichun Yang, Yuankai Huo
Abstract: Accurate delineation of the boundaries between the renal cortex and medulla is crucial for subsequent functional structural analysis and disease diagnosis. Training high-quality deep-learning models for layer segmentation relies on the availability of large amounts of annotated data. However, due to the privacy of patients' medical data and the scarcity of clinical cases, constructing pathological datasets from clinical sources is relatively difficult and expensive. Moreover, using external natural image datasets introduces noise during the domain generalization process. Cross-species homologous data, such as mouse kidney data, which exhibits high structural and feature similarity to human kidneys, has the potential to enhance model performance on human datasets. In this study, we incorporated a collected private Periodic Acid-Schiff (PAS) stained mouse kidney dataset into the human kidney dataset for joint training. The results showed that after introducing cross-species homologous data, semantic segmentation models based on CNN and Transformer architectures achieved an average increase of 1.77% and 1.24% in mIoU, and 1.76% and 0.89% in Dice score, for the human renal cortex and medulla datasets, respectively. This approach is also capable of enhancing the model's generalization ability. This indicates that cross-species homologous data, as a low-noise trainable data source, can help improve model performance under conditions of limited clinical samples. Code is available at https://github.com/hrlblab/layer_segmentation.
Submitted 17 August, 2024; originally announced August 2024.

arXiv:2408.00325 [cs.SD, eess.AS]
Iterative Prototype Refinement for Ambiguous Speech Emotion Recognition
Authors: Haoqin Sun, Shiwan Zhao, Xiangyu Kong, Xuechen Wang, Hui Wang, Jiaming Zhou, Yong Qin
Abstract: Recognizing emotions from speech is a daunting task due to the subtlety and ambiguity of expressions. Traditional speech emotion recognition (SER) systems, which typically rely on a singular, precise emotion label, struggle with this complexity. Therefore, modeling the inherent ambiguity of emotions is an urgent problem. In this paper, we propose an iterative prototype refinement framework (IPR) for ambiguous SER. IPR comprises two interlinked components: contrastive learning and class prototypes. The former provides an efficient way to obtain high-quality representations of ambiguous samples. The latter are dynamically updated based on ambiguous labels -- the similarity of the ambiguous data to all prototypes. These refined embeddings yield precise pseudo labels, thus reinforcing representation quality. Experimental evaluations conducted on the IEMOCAP dataset validate the superior performance of IPR over state-of-the-art methods, proving the effectiveness of our proposed method.
Submitted 1 August, 2024; originally announced August 2024.

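The interplay of class prototypes and soft assignments described above can be illustrated generically: compute similarities of embeddings to prototypes, soften them into pseudo-label distributions, and nudge the prototypes toward the assignment-weighted mean of the batch. Temperature and momentum below are placeholders, not values from the paper.

```python
import torch
import torch.nn.functional as F

def soft_assign(embeddings, prototypes, temperature=0.1):
    """Similarity of each embedding (N, D) to every class prototype (C, D),
    turned into a soft pseudo-label distribution (N, C)."""
    sims = F.normalize(embeddings, dim=1) @ F.normalize(prototypes, dim=1).T
    return F.softmax(sims / temperature, dim=1)

def update_prototypes(prototypes, embeddings, assignments, momentum=0.9):
    """EMA update: each prototype moves toward the assignment-weighted mean
    of the embeddings currently attributed to it."""
    weights = assignments / assignments.sum(dim=0, keepdim=True).clamp(min=1e-8)  # (N, C)
    targets = weights.T @ embeddings                                              # (C, D)
    return momentum * prototypes + (1.0 - momentum) * targets

# toy usage
emb, protos = torch.randn(32, 128), torch.randn(4, 128)
assign = soft_assign(emb, protos)
pseudo_labels = assign.argmax(dim=1)    # hard pseudo labels, if needed
protos = update_prototypes(protos, emb, assign)
```
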
arXiv:2407.20469 [physics.optics, eess.IV]
Efficient, gigapixel-scale, aberration-free whole slide scanner using angular ptychographic imaging with closed-form solution
Authors: Shi Zhao, Haowen Zhou, Siyu Lin, Ruizhi Cao, Changhuei Yang
Abstract: Whole slide imaging provides a wide field-of-view (FOV) across cross-sections of biopsy or surgery samples, significantly facilitating pathological analysis and clinical diagnosis. Such high-quality images that enable detailed visualization of cellular and tissue structures are essential for effective patient care and treatment planning. To obtain such high-quality images for pathology applications, there is a need for scanners with high spatial-bandwidth products, free from aberrations, and without the requirement for z-scanning. Here we report a whole slide imaging system based on angular ptychographic imaging with a closed-form solution (WSI-APIC), which offers efficient, tens-of-gigapixels, large-FOV, aberration-free imaging. WSI-APIC utilizes oblique incoherent illumination for initial high-level segmentation, thereby bypassing unnecessary scanning of background regions and enhancing image acquisition efficiency. A GPU-accelerated APIC algorithm analytically reconstructs phase images with effective digital aberration corrections and improved optical resolutions. Moreover, an auto-stitching technique based on the scale-invariant feature transform ensures the seamless concatenation of whole slide phase images. In our experiment, WSI-APIC achieved an optical resolution of 772 nm using a 10x/0.25 NA objective lens and captured 80-gigapixel aberration-free phase images for a standard 76.2 mm x 25.4 mm microscope slide.
Submitted 29 July, 2024; originally announced July 2024.

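SIFT-based stitching of overlapping tiles, as mentioned above, is commonly done by matching keypoints between neighbouring tiles and estimating a homography with RANSAC. A generic OpenCV sketch assuming 8-bit grayscale tiles with sufficient overlap; this is not the paper's pipeline, only the standard technique it names.

```python
import cv2
import numpy as np

def estimate_tile_alignment(tile_a, tile_b, ratio=0.75):
    """Match SIFT keypoints between two overlapping grayscale tiles and
    return the homography mapping tile_b onto tile_a."""
    sift = cv2.SIFT_create()
    kp_a, des_a = sift.detectAndCompute(tile_a, None)
    kp_b, des_b = sift.detectAndCompute(tile_b, None)

    matcher = cv2.BFMatcher(cv2.NORM_L2)
    matches = matcher.knnMatch(des_b, des_a, k=2)
    good = [m for m, n in (p for p in matches if len(p) == 2)
            if m.distance < ratio * n.distance]            # Lowe ratio test

    src = np.float32([kp_b[m.queryIdx].pt for m in good]).reshape(-1, 1, 2)
    dst = np.float32([kp_a[m.trainIdx].pt for m in good]).reshape(-1, 1, 2)
    H, _ = cv2.findHomography(src, dst, cv2.RANSAC, 5.0)   # robust to outlier matches
    return H
```
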
arXiv:2407.18461 [cs.SD, cs.CL, eess.AS] doi:10.21437/Interspeech.2024-1360
Enhancing Dysarthric Speech Recognition for Unseen Speakers via Prototype-Based Adaptation
Authors: Shiyao Wang, Shiwan Zhao, Jiaming Zhou, Aobo Kong, Yong Qin
Abstract: Dysarthric speech recognition (DSR) presents a formidable challenge due to inherent inter-speaker variability, leading to severe performance degradation when applying DSR models to new dysarthric speakers. Traditional speaker adaptation methodologies typically involve fine-tuning models for each speaker, but this strategy is cost-prohibitive and inconvenient for disabled users, requiring substantial data collection. To address this issue, we introduce a prototype-based approach that markedly improves DSR performance for unseen dysarthric speakers without additional fine-tuning. Our method employs a feature extractor trained with HuBERT to produce per-word prototypes that encapsulate the characteristics of previously unseen speakers. These prototypes serve as the basis for classification. Additionally, we incorporate supervised contrastive learning to refine feature extraction. By enhancing representation quality, we further improve DSR performance, enabling effective personalized DSR. We release our code at https://github.com/NKU-HLT/PB-DSR.
Submitted 25 July, 2024; originally announced July 2024.
Comments: accepted by Interspeech 2024
Journal ref: INTERSPEECH 2024

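Supervised contrastive learning, which the abstract above uses to refine the feature extractor, pulls embeddings that share a label together and pushes the rest apart. A compact, self-contained version of that loss; the temperature and toy shapes are placeholders rather than the paper's settings.

```python
import torch
import torch.nn.functional as F

def supervised_contrastive_loss(features, labels, temperature=0.07):
    """Pull same-label embeddings together, push different-label ones apart.
    features: (N, D) un-normalized embeddings, labels: (N,) integer ids."""
    z = F.normalize(features, dim=1)
    sim = z @ z.T / temperature                              # (N, N) scaled cosine similarities
    eye = torch.eye(len(z), dtype=torch.bool)
    sim = sim.masked_fill(eye, float("-inf"))                # ignore self-similarity
    log_prob = sim - torch.logsumexp(sim, dim=1, keepdim=True)
    log_prob = log_prob.masked_fill(eye, 0.0)                # avoid -inf * 0 below
    positives = (labels.unsqueeze(0) == labels.unsqueeze(1)) & ~eye
    pos_counts = positives.sum(dim=1)
    per_anchor = -(log_prob * positives).sum(dim=1) / pos_counts.clamp(min=1)
    return per_anchor[pos_counts > 0].mean()                 # anchors with >= 1 positive

# toy usage: 8 utterances of 4 words, 256-dim features
feats, labels = torch.randn(8, 256), torch.tensor([0, 0, 1, 1, 2, 2, 3, 3])
print(supervised_contrastive_loss(feats, labels))
```
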
arXiv:2407.18390 [eess.IV, cs.CV]
Adapting Mouse Pathological Model to Human Glomerular Lesion Segmentation
Authors: Lining Yu, Mengmeng Yin, Ruining Deng, Quan Liu, Tianyuan Yao, Can Cui, Yu Wang, Yaohong Wang, Shilin Zhao, Haichun Yang, Yuankai Huo
Abstract: Moving from animal models to human applications in preclinical research encompasses a broad spectrum of disciplines in medical science. A fundamental element in the development of new drugs, treatments, diagnostic methods, and in deepening our understanding of disease processes is the accurate measurement of kidney tissues. Past studies have demonstrated the viability of translating glomeruli segmentation techniques from mouse models to human applications. Yet, these investigations tend to neglect the complexities involved in segmenting pathological glomeruli affected by different lesions. Such lesions present a wider range of morphological variations compared to healthy glomerular tissue, and are arguably more valuable than normal glomeruli in clinical practice. Furthermore, data on lesions from animal models can be more readily scaled up from disease models and whole kidney biopsies. This brings up a question: "Can a pathological segmentation model trained on mouse models be effectively applied to human patients?" To answer this question, we introduced GLAM, a deep learning study for fine-grained segmentation of human kidney lesions using a mouse model, addressing mouse-to-human transfer learning by evaluating different strategies for segmenting human pathological lesions, namely zero-shot transfer learning and hybrid learning that leverages mouse samples. From the results, the hybrid learning model achieved superior performance.
Submitted 25 July, 2024; originally announced July 2024.

This paper introduces EmoCtrl-TTS, an emotion-controllable zero-shot TTS that can generate highly emotional speech with NVs for any speaker. Em… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.12229v2-abstract-full').style.display = 'inline'; document.getElementById('2407.12229v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.12229v2-abstract-full" style="display: none;"> People change their tones of voice, often accompanied by nonverbal vocalizations (NVs) such as laughter and cries, to convey rich emotions. However, most text-to-speech (TTS) systems lack the capability to generate speech with rich emotions, including NVs. This paper introduces EmoCtrl-TTS, an emotion-controllable zero-shot TTS that can generate highly emotional speech with NVs for any speaker. EmoCtrl-TTS leverages arousal and valence values, as well as laughter embeddings, to condition the flow-matching-based zero-shot TTS. To achieve high-quality emotional speech generation, EmoCtrl-TTS is trained using more than 27,000 hours of expressive data curated based on pseudo-labeling. Comprehensive evaluations demonstrate that EmoCtrl-TTS excels in mimicking the emotions of audio prompts in speech-to-speech translation scenarios. We also show that EmoCtrl-TTS can capture emotion changes, express strong emotions, and generate various NVs in zero-shot TTS. See https://aka.ms/emoctrl-tts for demo samples. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.12229v2-abstract-full').style.display = 'none'; document.getElementById('2407.12229v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 16 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by SLT2024. 
See https://aka.ms/emoctrl-tts for demo samples</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.10833">arXiv:2407.10833</a> <span> [<a href="https://arxiv.org/pdf/2407.10833">pdf</a>, <a href="https://arxiv.org/format/2407.10833">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> MoE-DiffIR: Task-customized Diffusion Priors for Universal Compressed Image Restoration </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Ren%2C+Y">Yulin Ren</a>, <a href="/search/eess?searchtype=author&query=Li%2C+X">Xin Li</a>, <a href="/search/eess?searchtype=author&query=Li%2C+B">Bingchen Li</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+X">Xingrui Wang</a>, <a href="/search/eess?searchtype=author&query=Guo%2C+M">Mengxi Guo</a>, <a href="/search/eess?searchtype=author&query=Zhao%2C+S">Shijie Zhao</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+L">Li Zhang</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+Z">Zhibo Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.10833v1-abstract-short" style="display: inline;"> We present MoE-DiffIR, an innovative universal compressed image restoration (CIR) method with task-customized diffusion priors. This intends to handle two pivotal challenges in the existing CIR methods: (i) lacking adaptability and universality for different image codecs, e.g., JPEG and WebP; (ii) poor texture generation capability, particularly at low bitrates. Specifically, our MoE-DiffIR develo… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.10833v1-abstract-full').style.display = 'inline'; document.getElementById('2407.10833v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.10833v1-abstract-full" style="display: none;"> We present MoE-DiffIR, an innovative universal compressed image restoration (CIR) method with task-customized diffusion priors. This intends to handle two pivotal challenges in the existing CIR methods: (i) lacking adaptability and universality for different image codecs, e.g., JPEG and WebP; (ii) poor texture generation capability, particularly at low bitrates. Specifically, our MoE-DiffIR develops the powerful mixture-of-experts (MoE) prompt module, where some basic prompts cooperate to excavate the task-customized diffusion priors from Stable Diffusion (SD) for each compression task. Moreover, the degradation-aware routing mechanism is proposed to enable the flexible assignment of basic prompts. To activate and reuse the cross-modality generation prior of SD, we design the visual-to-text adapter for MoE-DiffIR, which aims to adapt the embedding of low-quality images from the visual domain to the textual domain as the textual guidance for SD, enabling more consistent and reasonable texture generation. We also construct one comprehensive benchmark dataset for universal CIR, covering 21 types of degradations from 7 popular traditional and learned codecs. 
Extensive experiments on universal CIR have demonstrated the excellent robustness and texture restoration capability of our proposed MoE-DiffIR. The project can be found at https://renyulin-f.github.io/MoE-DiffIR.github.io/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.10833v1-abstract-full').style.display = 'none'; document.getElementById('2407.10833v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ECCV 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.09029">arXiv:2407.09029</a> <span> [<a href="https://arxiv.org/pdf/2407.09029">pdf</a>, <a href="https://arxiv.org/format/2407.09029">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Enhancing Emotion Recognition in Incomplete Data: A Novel Cross-Modal Alignment, Reconstruction, and Refinement Framework </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Sun%2C+H">Haoqin Sun</a>, <a href="/search/eess?searchtype=author&query=Zhao%2C+S">Shiwan Zhao</a>, <a href="/search/eess?searchtype=author&query=Li%2C+S">Shaokai Li</a>, <a href="/search/eess?searchtype=author&query=Kong%2C+X">Xiangyu Kong</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+X">Xuechen Wang</a>, <a href="/search/eess?searchtype=author&query=Kong%2C+A">Aobo Kong</a>, <a href="/search/eess?searchtype=author&query=Zhou%2C+J">Jiaming Zhou</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+Y">Yong Chen</a>, <a href="/search/eess?searchtype=author&query=Zeng%2C+W">Wenjia Zeng</a>, <a href="/search/eess?searchtype=author&query=Qin%2C+Y">Yong Qin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.09029v1-abstract-short" style="display: inline;"> Multimodal emotion recognition systems rely heavily on the full availability of modalities, suffering significant performance declines when modal data is incomplete. 
To tackle this issue, we present the Cross-Modal Alignment, Reconstruction, and Refinement (CM-ARR) framework, an innovative approach that sequentially engages in cross-modal alignment, reconstruction, and refinement phases to handle… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.09029v1-abstract-full').style.display = 'inline'; document.getElementById('2407.09029v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.09029v1-abstract-full" style="display: none;"> Multimodal emotion recognition systems rely heavily on the full availability of modalities, suffering significant performance declines when modal data is incomplete. To tackle this issue, we present the Cross-Modal Alignment, Reconstruction, and Refinement (CM-ARR) framework, an innovative approach that sequentially engages in cross-modal alignment, reconstruction, and refinement phases to handle missing modalities and enhance emotion recognition. This framework utilizes unsupervised distribution-based contrastive learning to align heterogeneous modal distributions, reducing discrepancies and modeling semantic uncertainty effectively. The reconstruction phase applies normalizing flow models to transform these aligned distributions and recover missing modalities. The refinement phase employs supervised point-based contrastive learning to disrupt semantic correlations and accentuate emotional traits, thereby enriching the affective content of the reconstructed representations. Extensive experiments on the IEMOCAP and MSP-IMPROV datasets confirm the superior performance of CM-ARR under conditions of both missing and complete modalities. Notably, averaged across six scenarios of missing modalities, CM-ARR achieves absolute improvements of 2.11% in WAR and 2.12% in UAR on the IEMOCAP dataset, and 1.71% and 1.96% in WAR and UAR, respectively, on the MSP-IMPROV dataset. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.09029v1-abstract-full').style.display = 'none'; document.getElementById('2407.09029v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.08551">arXiv:2407.08551</a> <span> [<a href="https://arxiv.org/pdf/2407.08551">pdf</a>, <a href="https://arxiv.org/format/2407.08551">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Autoregressive Speech Synthesis without Vector Quantization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Meng%2C+L">Lingwei Meng</a>, <a href="/search/eess?searchtype=author&query=Zhou%2C+L">Long Zhou</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+S">Shujie Liu</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+S">Sanyuan Chen</a>, <a href="/search/eess?searchtype=author&query=Han%2C+B">Bing Han</a>, <a href="/search/eess?searchtype=author&query=Hu%2C+S">Shujie Hu</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+Y">Yanqing Liu</a>, <a href="/search/eess?searchtype=author&query=Li%2C+J">Jinyu Li</a>, <a href="/search/eess?searchtype=author&query=Zhao%2C+S">Sheng Zhao</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+X">Xixin Wu</a>, <a href="/search/eess?searchtype=author&query=Meng%2C+H">Helen Meng</a>, <a href="/search/eess?searchtype=author&query=Wei%2C+F">Furu Wei</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.08551v1-abstract-short" style="display: inline;"> We present MELLE, a novel continuous-valued token-based language modeling approach for text-to-speech synthesis (TTS). MELLE autoregressively generates continuous mel-spectrogram frames directly from the text condition, bypassing the need for vector quantization, which is originally designed for audio compression and sacrifices fidelity compared to mel-spectrograms. Specifically, (i) instead of cross… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.08551v1-abstract-full').style.display = 'inline'; document.getElementById('2407.08551v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.08551v1-abstract-full" style="display: none;"> We present MELLE, a novel continuous-valued token-based language modeling approach for text-to-speech synthesis (TTS). MELLE autoregressively generates continuous mel-spectrogram frames directly from the text condition, bypassing the need for vector quantization, which is originally designed for audio compression and sacrifices fidelity compared to mel-spectrograms. Specifically, (i) instead of cross-entropy loss, we apply regression loss with a proposed spectrogram flux loss function to model the probability distribution of the continuous-valued tokens. (ii) we incorporate variational inference into MELLE to facilitate sampling mechanisms, thereby enhancing the output diversity and model robustness. 
Experiments demonstrate that, compared to the two-stage codec language models VALL-E and its variants, the single-stage MELLE mitigates robustness issues by avoiding the inherent flaws of sampling discrete codes, achieves superior performance across multiple metrics, and, most importantly, offers a more streamlined paradigm. See https://aka.ms/melle for demos of our work. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.08551v1-abstract-full').style.display = 'none'; document.getElementById('2407.08551v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.05758">arXiv:2407.05758</a> <span> [<a href="https://arxiv.org/pdf/2407.05758">pdf</a>, <a href="https://arxiv.org/format/2407.05758">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Potential of Multimodal Large Language Models for Data Mining of Medical Images and Free-text Reports </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhang%2C+Y">Yutong Zhang</a>, <a href="/search/eess?searchtype=author&query=Pan%2C+Y">Yi Pan</a>, <a href="/search/eess?searchtype=author&query=Zhong%2C+T">Tianyang Zhong</a>, <a href="/search/eess?searchtype=author&query=Dong%2C+P">Peixin Dong</a>, <a href="/search/eess?searchtype=author&query=Xie%2C+K">Kangni Xie</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+Y">Yuxiao Liu</a>, <a href="/search/eess?searchtype=author&query=Jiang%2C+H">Hanqi Jiang</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+Z">Zhengliang Liu</a>, <a href="/search/eess?searchtype=author&query=Zhao%2C+S">Shijie Zhao</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+T">Tuo Zhang</a>, <a href="/search/eess?searchtype=author&query=Jiang%2C+X">Xi Jiang</a>, <a href="/search/eess?searchtype=author&query=Shen%2C+D">Dinggang Shen</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+T">Tianming Liu</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+X">Xin Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.05758v1-abstract-short" style="display: inline;"> Medical images and radiology reports are crucial for diagnosing medical conditions, highlighting the importance of quantitative analysis for clinical decision-making. However, the diversity and cross-source heterogeneity of these data challenge the generalizability of current data-mining methods. 
Multimodal large language models (MLLMs) have recently transformed many domains, significantly affecti… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.05758v1-abstract-full').style.display = 'inline'; document.getElementById('2407.05758v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.05758v1-abstract-full" style="display: none;"> Medical images and radiology reports are crucial for diagnosing medical conditions, highlighting the importance of quantitative analysis for clinical decision-making. However, the diversity and cross-source heterogeneity of these data challenge the generalizability of current data-mining methods. Multimodal large language models (MLLMs) have recently transformed many domains, significantly affecting the medical field. Notably, Gemini-Vision-series (Gemini) and GPT-4-series (GPT-4) models have epitomized a paradigm shift in Artificial General Intelligence (AGI) for computer vision, showcasing their potential in the biomedical domain. In this study, we exhaustively evaluated the performance of the Gemini, GPT-4, and 4 popular large models across 14 medical imaging datasets, including 5 medical imaging categories (dermatology, radiology, dentistry, ophthalmology, and endoscopy), and 3 radiology report datasets. The investigated tasks encompass disease classification, lesion segmentation, anatomical localization, disease diagnosis, report generation, and lesion detection. Our experimental results demonstrated that Gemini-series models excelled in report generation and lesion detection but face challenges in disease classification and anatomical localization. Conversely, GPT-series models exhibited proficiency in lesion segmentation and anatomical localization but encountered difficulties in disease diagnosis and lesion detection. Additionally, both the Gemini series and GPT series contain models that have demonstrated commendable generation efficiency. While both models hold promise in reducing physician workload, alleviating pressure on limited healthcare resources, and fostering collaboration between clinical practitioners and artificial intelligence technologies, substantial enhancements and comprehensive validations remain imperative before clinical deployment. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.05758v1-abstract-full').style.display = 'none'; document.getElementById('2407.05758v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.03245">arXiv:2407.03245</a> <span> [<a href="https://arxiv.org/pdf/2407.03245">pdf</a>, <a href="https://arxiv.org/format/2407.03245">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> TieBot: Learning to Knot a Tie from Visual Demonstration through a Real-to-Sim-to-Real Approach </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Peng%2C+W">Weikun Peng</a>, <a href="/search/eess?searchtype=author&query=Lv%2C+J">Jun Lv</a>, <a href="/search/eess?searchtype=author&query=Zeng%2C+Y">Yuwei Zeng</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+H">Haonan Chen</a>, <a href="/search/eess?searchtype=author&query=Zhao%2C+S">Siheng Zhao</a>, <a href="/search/eess?searchtype=author&query=Sun%2C+J">Jichen Sun</a>, <a href="/search/eess?searchtype=author&query=Lu%2C+C">Cewu Lu</a>, <a href="/search/eess?searchtype=author&query=Shao%2C+L">Lin Shao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.03245v3-abstract-short" style="display: inline;"> The tie-knotting task is highly challenging due to the tie's high deformation and long-horizon manipulation actions. This work presents TieBot, a Real-to-Sim-to-Real learning from visual demonstration system for the robots to learn to knot a tie. We introduce the Hierarchical Feature Matching approach to estimate a sequence of tie's meshes from the demonstration video. With these estimated meshes… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.03245v3-abstract-full').style.display = 'inline'; document.getElementById('2407.03245v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.03245v3-abstract-full" style="display: none;"> The tie-knotting task is highly challenging due to the tie's high deformation and long-horizon manipulation actions. This work presents TieBot, a Real-to-Sim-to-Real learning from visual demonstration system for the robots to learn to knot a tie. We introduce the Hierarchical Feature Matching approach to estimate a sequence of tie's meshes from the demonstration video. With these estimated meshes used as subgoals, we first learn a teacher policy using privileged information. Then, we learn a student policy with point cloud observation by imitating teacher policy. Lastly, our pipeline applies learned policy to real-world execution. We demonstrate the effectiveness of TieBot in simulation and the real world. In the real-world experiment, a dual-arm robot successfully knots a tie, achieving 50% success rate among 10 trials. Videos can be found https://tiebots.github.io/. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.03245v3-abstract-full').style.display = 'none'; document.getElementById('2407.03245v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 3 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by CoRL 2024 as Oral presentation, camera-ready version</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.00596">arXiv:2407.00596</a> <span> [<a href="https://arxiv.org/pdf/2407.00596">pdf</a>, <a href="https://arxiv.org/format/2407.00596">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> HATs: Hierarchical Adaptive Taxonomy Segmentation for Panoramic Pathology Image Analysis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Deng%2C+R">Ruining Deng</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+Q">Quan Liu</a>, <a href="/search/eess?searchtype=author&query=Cui%2C+C">Can Cui</a>, <a href="/search/eess?searchtype=author&query=Yao%2C+T">Tianyuan Yao</a>, <a href="/search/eess?searchtype=author&query=Xiong%2C+J">Juming Xiong</a>, <a href="/search/eess?searchtype=author&query=Bao%2C+S">Shunxing Bao</a>, <a href="/search/eess?searchtype=author&query=Li%2C+H">Hao Li</a>, <a href="/search/eess?searchtype=author&query=Yin%2C+M">Mengmeng Yin</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+Y">Yu Wang</a>, <a href="/search/eess?searchtype=author&query=Zhao%2C+S">Shilin Zhao</a>, <a href="/search/eess?searchtype=author&query=Tang%2C+Y">Yucheng Tang</a>, <a href="/search/eess?searchtype=author&query=Yang%2C+H">Haichun Yang</a>, <a href="/search/eess?searchtype=author&query=Huo%2C+Y">Yuankai Huo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.00596v1-abstract-short" style="display: inline;"> Panoramic image segmentation in computational pathology presents a remarkable challenge due to the morphologically complex and variably scaled anatomy. For instance, the intricate organization in kidney pathology spans multiple layers, from regions like the cortex and medulla to functional units such as glomeruli, tubules, and vessels, down to various cell types. 
In this paper, we propose a novel… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.00596v1-abstract-full').style.display = 'inline'; document.getElementById('2407.00596v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.00596v1-abstract-full" style="display: none;"> Panoramic image segmentation in computational pathology presents a remarkable challenge due to the morphologically complex and variably scaled anatomy. For instance, the intricate organization in kidney pathology spans multiple layers, from regions like the cortex and medulla to functional units such as glomeruli, tubules, and vessels, down to various cell types. In this paper, we propose a novel Hierarchical Adaptive Taxonomy Segmentation (HATs) method, which is designed to thoroughly segment panoramic views of kidney structures by leveraging detailed anatomical insights. Our approach entails (1) the innovative HATs technique which translates spatial relationships among 15 distinct object classes into a versatile "plug-and-play" loss function that spans across regions, functional units, and cells, (2) the incorporation of anatomical hierarchies and scale considerations into a unified simple matrix representation for all panoramic entities, (3) the adoption of the latest AI foundation model (EfficientSAM) as a feature extraction tool to boost the model's adaptability, yet eliminating the need for manual prompt generation in conventional segment anything model (SAM). Experimental findings demonstrate that the HATs method offers an efficient and effective strategy for integrating clinical insights and imaging precedents into a unified segmentation model across more than 15 categories. The official implementation is publicly available at https://github.com/hrlblab/HATs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.00596v1-abstract-full').style.display = 'none'; document.getElementById('2407.00596v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">arXiv admin note: text overlap with arXiv:2402.19286</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.18009">arXiv:2406.18009</a> <span> [<a href="https://arxiv.org/pdf/2406.18009">pdf</a>, <a href="https://arxiv.org/format/2406.18009">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> E2 TTS: Embarrassingly Easy Fully Non-Autoregressive Zero-Shot TTS </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Eskimez%2C+S+E">Sefik Emre Eskimez</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+X">Xiaofei Wang</a>, <a href="/search/eess?searchtype=author&query=Thakker%2C+M">Manthan Thakker</a>, <a href="/search/eess?searchtype=author&query=Li%2C+C">Canrun Li</a>, <a href="/search/eess?searchtype=author&query=Tsai%2C+C">Chung-Hsien Tsai</a>, <a href="/search/eess?searchtype=author&query=Xiao%2C+Z">Zhen Xiao</a>, <a href="/search/eess?searchtype=author&query=Yang%2C+H">Hemin Yang</a>, <a href="/search/eess?searchtype=author&query=Zhu%2C+Z">Zirun Zhu</a>, <a href="/search/eess?searchtype=author&query=Tang%2C+M">Min Tang</a>, <a href="/search/eess?searchtype=author&query=Tan%2C+X">Xu Tan</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+Y">Yanqing Liu</a>, <a href="/search/eess?searchtype=author&query=Zhao%2C+S">Sheng Zhao</a>, <a href="/search/eess?searchtype=author&query=Kanda%2C+N">Naoyuki Kanda</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.18009v2-abstract-short" style="display: inline;"> This paper introduces Embarrassingly Easy Text-to-Speech (E2 TTS), a fully non-autoregressive zero-shot text-to-speech system that offers human-level naturalness and state-of-the-art speaker similarity and intelligibility. In the E2 TTS framework, the text input is converted into a character sequence with filler tokens. The flow-matching-based mel spectrogram generator is then trained based on the… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.18009v2-abstract-full').style.display = 'inline'; document.getElementById('2406.18009v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.18009v2-abstract-full" style="display: none;"> This paper introduces Embarrassingly Easy Text-to-Speech (E2 TTS), a fully non-autoregressive zero-shot text-to-speech system that offers human-level naturalness and state-of-the-art speaker similarity and intelligibility. In the E2 TTS framework, the text input is converted into a character sequence with filler tokens. The flow-matching-based mel spectrogram generator is then trained based on the audio infilling task. Unlike many previous works, it does not require additional components (e.g., duration model, grapheme-to-phoneme) or complex techniques (e.g., monotonic alignment search). 
Despite its simplicity, E2 TTS achieves state-of-the-art zero-shot TTS capabilities that are comparable to or surpass previous works, including Voicebox and NaturalSpeech 3. The simplicity of E2 TTS also allows for flexibility in the input representation. We propose several variants of E2 TTS to improve usability during inference. See https://aka.ms/e2tts/ for demo samples. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.18009v2-abstract-full').style.display = 'none'; document.getElementById('2406.18009v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 25 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to SLT 2024. Added evaluation data, see https://github.com/microsoft/e2tts-test-suite for more details</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.12434">arXiv:2406.12434</a> <span> [<a href="https://arxiv.org/pdf/2406.12434">pdf</a>, <a href="https://arxiv.org/format/2406.12434">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Towards Audio Codec-based Speech Separation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Yip%2C+J+Q">Jia Qi Yip</a>, <a href="/search/eess?searchtype=author&query=Zhao%2C+S">Shengkui Zhao</a>, <a href="/search/eess?searchtype=author&query=Ng%2C+D">Dianwen Ng</a>, <a href="/search/eess?searchtype=author&query=Chng%2C+E+S">Eng Siong Chng</a>, <a href="/search/eess?searchtype=author&query=Ma%2C+B">Bin Ma</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.12434v2-abstract-short" style="display: inline;"> Recent improvements in neural audio codec (NAC) models have generated interest in adopting pre-trained codecs for a variety of speech processing applications to take advantage of the efficiencies gained from high compression, but these have yet to be applied to the speech separation (SS) task. 
SS can benefit from high compression because the compute required for traditional SS models makes them imp… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.12434v2-abstract-full').style.display = 'inline'; document.getElementById('2406.12434v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.12434v2-abstract-full" style="display: none;"> Recent improvements in neural audio codec (NAC) models have generated interest in adopting pre-trained codecs for a variety of speech processing applications to take advantage of the efficiencies gained from high compression, but these have yet to be applied to the speech separation (SS) task. SS can benefit from high compression because the compute required for traditional SS models makes them impractical for many edge computing use cases. However, SS is a waveform-masking task where compression tends to introduce distortions that severely impact performance. Here we propose a novel task of Audio Codec-based SS, where SS is performed within the embedding space of a NAC, and propose a new model, Codecformer, to address this task. At inference, Codecformer achieves a 52x reduction in MAC while producing separation performance comparable to a cloud deployment of Sepformer. This method charts a new direction for performing efficient SS in practical scenarios. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.12434v2-abstract-full').style.display = 'none'; document.getElementById('2406.12434v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This paper was accepted by Interspeech 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.07855">arXiv:2406.07855</a> <span> [<a href="https://arxiv.org/pdf/2406.07855">pdf</a>, <a href="https://arxiv.org/format/2406.07855">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> VALL-E R: Robust and Efficient Zero-Shot Text-to-Speech Synthesis via Monotonic Alignment </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Han%2C+B">Bing Han</a>, <a href="/search/eess?searchtype=author&query=Zhou%2C+L">Long Zhou</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+S">Shujie Liu</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+S">Sanyuan Chen</a>, <a href="/search/eess?searchtype=author&query=Meng%2C+L">Lingwei Meng</a>, <a href="/search/eess?searchtype=author&query=Qian%2C+Y">Yanming Qian</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+Y">Yanqing Liu</a>, <a href="/search/eess?searchtype=author&query=Zhao%2C+S">Sheng Zhao</a>, <a href="/search/eess?searchtype=author&query=Li%2C+J">Jinyu Li</a>, <a href="/search/eess?searchtype=author&query=Wei%2C+F">Furu Wei</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.07855v1-abstract-short" style="display: inline;"> With the help of discrete neural audio codecs, large language models (LLM) have increasingly been recognized as a promising methodology for zero-shot Text-to-Speech (TTS) synthesis. However, sampling based decoding strategies bring astonishing diversity to generation, but also pose robustness issues such as typos, omissions and repetition. In addition, the high sampling rate of audio also brings h… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.07855v1-abstract-full').style.display = 'inline'; document.getElementById('2406.07855v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.07855v1-abstract-full" style="display: none;"> With the help of discrete neural audio codecs, large language models (LLM) have increasingly been recognized as a promising methodology for zero-shot Text-to-Speech (TTS) synthesis. However, sampling based decoding strategies bring astonishing diversity to generation, but also pose robustness issues such as typos, omissions and repetition. In addition, the high sampling rate of audio also brings huge computational overhead to the inference process of autoregression. To address these issues, we propose VALL-E R, a robust and efficient zero-shot TTS system, building upon the foundation of VALL-E. 
Specifically, we introduce a phoneme monotonic alignment strategy to strengthen the connection between phonemes and the acoustic sequence, ensuring a more precise alignment by constraining the acoustic tokens to match their associated phonemes. Furthermore, we employ a codec-merging approach to downsample the discrete codes in the shallow quantization layer, thereby accelerating the decoding speed while preserving the high quality of speech output. Benefiting from these strategies, VALL-E R obtains controllability over phonemes and demonstrates its strong robustness by approaching the WER of the ground truth. In addition, it requires fewer autoregressive steps, with over 60% time reduction during inference. This research has the potential to be applied to meaningful projects, including the creation of speech for those affected by aphasia. Audio samples will be available at: https://aka.ms/valler. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.07855v1-abstract-full').style.display = 'none'; document.getElementById('2406.07855v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">15 pages, 5 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.05699">arXiv:2406.05699</a> <span> [<a href="https://arxiv.org/pdf/2406.05699">pdf</a>, <a href="https://arxiv.org/ps/2406.05699">ps</a>, <a href="https://arxiv.org/format/2406.05699">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> An Investigation of Noise Robustness for Flow-Matching-Based Zero-Shot TTS </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Wang%2C+X">Xiaofei Wang</a>, <a href="/search/eess?searchtype=author&query=Eskimez%2C+S+E">Sefik Emre Eskimez</a>, <a href="/search/eess?searchtype=author&query=Thakker%2C+M">Manthan Thakker</a>, <a href="/search/eess?searchtype=author&query=Yang%2C+H">Hemin Yang</a>, <a href="/search/eess?searchtype=author&query=Zhu%2C+Z">Zirun Zhu</a>, <a href="/search/eess?searchtype=author&query=Tang%2C+M">Min Tang</a>, <a href="/search/eess?searchtype=author&query=Xia%2C+Y">Yufei Xia</a>, <a href="/search/eess?searchtype=author&query=Li%2C+J">Jinzhu Li</a>, <a href="/search/eess?searchtype=author&query=Zhao%2C+S">Sheng Zhao</a>, <a href="/search/eess?searchtype=author&query=Li%2C+J">Jinyu Li</a>, <a href="/search/eess?searchtype=author&query=Kanda%2C+N">Naoyuki Kanda</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.05699v1-abstract-short" style="display: inline;"> Recently, zero-shot text-to-speech (TTS) systems, capable of synthesizing any speaker's 
voice from a short audio prompt, have made rapid advancements. However, the quality of the generated speech significantly deteriorates when the audio prompt contains noise, and limited research has been conducted to address this issue. In this paper, we explored various strategies to enhance the quality of audi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.05699v1-abstract-full').style.display = 'inline'; document.getElementById('2406.05699v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.05699v1-abstract-full" style="display: none;"> Recently, zero-shot text-to-speech (TTS) systems, capable of synthesizing any speaker's voice from a short audio prompt, have made rapid advancements. However, the quality of the generated speech significantly deteriorates when the audio prompt contains noise, and limited research has been conducted to address this issue. In this paper, we explored various strategies to enhance the quality of audio generated from noisy audio prompts within the context of flow-matching-based zero-shot TTS. Our investigation includes comprehensive training strategies: unsupervised pre-training with masked speech denoising, multi-speaker detection and DNSMOS-based data filtering on the pre-training data, and fine-tuning with random noise mixing. The results of our experiments demonstrate significant improvements in intelligibility, speaker similarity, and overall audio quality compared to the approach of applying speech enhancement to the audio prompt. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.05699v1-abstract-full').style.display = 'none'; document.getElementById('2406.05699v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to INTERSPEECH2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.05370">arXiv:2406.05370</a> <span> [<a href="https://arxiv.org/pdf/2406.05370">pdf</a>, <a href="https://arxiv.org/format/2406.05370">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> VALL-E 2: Neural Codec Language Models are Human Parity Zero-Shot Text to Speech Synthesizers </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Chen%2C+S">Sanyuan Chen</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+S">Shujie Liu</a>, <a href="/search/eess?searchtype=author&query=Zhou%2C+L">Long Zhou</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+Y">Yanqing Liu</a>, <a href="/search/eess?searchtype=author&query=Tan%2C+X">Xu Tan</a>, <a href="/search/eess?searchtype=author&query=Li%2C+J">Jinyu Li</a>, <a href="/search/eess?searchtype=author&query=Zhao%2C+S">Sheng Zhao</a>, <a href="/search/eess?searchtype=author&query=Qian%2C+Y">Yao Qian</a>, <a href="/search/eess?searchtype=author&query=Wei%2C+F">Furu Wei</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.05370v2-abstract-short" style="display: inline;"> This paper introduces VALL-E 2, the latest advancement in neural codec language models that marks a milestone in zero-shot text-to-speech synthesis (TTS), achieving human parity for the first time. Based on its predecessor, VALL-E, the new iteration introduces two significant enhancements: Repetition Aware Sampling refines the original nucleus sampling process by accounting for token repetition in… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.05370v2-abstract-full').style.display = 'inline'; document.getElementById('2406.05370v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.05370v2-abstract-full" style="display: none;"> This paper introduces VALL-E 2, the latest advancement in neural codec language models that marks a milestone in zero-shot text-to-speech synthesis (TTS), achieving human parity for the first time. Based on its predecessor, VALL-E, the new iteration introduces two significant enhancements: Repetition Aware Sampling refines the original nucleus sampling process by accounting for token repetition in the decoding history. It not only stabilizes the decoding but also circumvents the infinite loop issue. Grouped Code Modeling organizes codec codes into groups to effectively shorten the sequence length, which not only boosts inference speed but also addresses the challenges of long sequence modeling. Our experiments on the LibriSpeech and VCTK datasets show that VALL-E 2 surpasses previous systems in speech robustness, naturalness, and speaker similarity. 
It is the first of its kind to reach human parity on these benchmarks. Moreover, VALL-E 2 consistently synthesizes high-quality speech, even for sentences that are traditionally challenging due to their complexity or repetitive phrases. The advantages of this work could contribute to valuable endeavors, such as generating speech for individuals with aphasia or people with amyotrophic lateral sclerosis. See https://aka.ms/valle2 for demos of VALL-E 2. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.05370v2-abstract-full').style.display = 'none'; document.getElementById('2406.05370v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 8 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Demo posted</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.04633">arXiv:2406.04633</a> <span> [<a href="https://arxiv.org/pdf/2406.04633">pdf</a>, <a href="https://arxiv.org/ps/2406.04633">ps</a>, <a href="https://arxiv.org/format/2406.04633">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Boosting Diffusion Model for Spectrogram Up-sampling in Text-to-speech: An Empirical Study </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhang%2C+C">Chong Zhang</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+Y">Yanqing Liu</a>, <a href="/search/eess?searchtype=author&query=Zheng%2C+Y">Yang Zheng</a>, <a href="/search/eess?searchtype=author&query=Zhao%2C+S">Sheng Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.04633v1-abstract-short" style="display: inline;"> Scaling text-to-speech (TTS) with an autoregressive language model (LM) to large-scale datasets by quantizing waveforms into discrete speech tokens is making great progress in capturing the diversity and expressiveness of human speech, but the speech reconstruction quality from discrete speech tokens is far from satisfactory and depends on the speech token compression ratio. Generative diffusion… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.04633v1-abstract-full').style.display = 'inline'; document.getElementById('2406.04633v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.04633v1-abstract-full" style="display: none;"> Scaling text-to-speech (TTS) with an autoregressive language model (LM) to large-scale datasets by quantizing waveforms into discrete speech tokens is making great progress in capturing the diversity and expressiveness of human speech, but the speech reconstruction quality from discrete speech tokens is far from satisfactory and depends on the speech token compression ratio. 
Generative diffusion models trained with a score-matching loss and continuous normalizing flows trained with a flow-matching loss have become prominent in the generation of images as well as speech. LM-based TTS systems usually quantize speech into discrete tokens, generate these tokens autoregressively, and finally use a diffusion model to upsample the coarse-grained speech tokens into fine-grained codec features or mel-spectrograms before reconstructing waveforms with a vocoder; this pipeline has high latency and is not practical for real-time speech applications. In this paper, we systematically investigate various diffusion models for the upsampling stage, which is the main bottleneck for streaming synthesis in LM- and diffusion-based architectures, and we present the model architectures as well as objective and subjective metrics to show the quality and efficiency improvements. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.04633v1-abstract-full').style.display = 'none'; document.getElementById('2406.04633v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.04281">arXiv:2406.04281</a> <span> [<a href="https://arxiv.org/pdf/2406.04281">pdf</a>, <a href="https://arxiv.org/format/2406.04281">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Total-Duration-Aware Duration Modeling for Text-to-Speech Systems </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Eskimez%2C+S+E">Sefik Emre Eskimez</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+X">Xiaofei Wang</a>, <a href="/search/eess?searchtype=author&query=Thakker%2C+M">Manthan Thakker</a>, <a href="/search/eess?searchtype=author&query=Tsai%2C+C">Chung-Hsien Tsai</a>, <a href="/search/eess?searchtype=author&query=Li%2C+C">Canrun Li</a>, <a href="/search/eess?searchtype=author&query=Xiao%2C+Z">Zhen Xiao</a>, <a href="/search/eess?searchtype=author&query=Yang%2C+H">Hemin Yang</a>, <a href="/search/eess?searchtype=author&query=Zhu%2C+Z">Zirun Zhu</a>, <a href="/search/eess?searchtype=author&query=Tang%2C+M">Min Tang</a>, <a href="/search/eess?searchtype=author&query=Li%2C+J">Jinyu Li</a>, <a href="/search/eess?searchtype=author&query=Zhao%2C+S">Sheng Zhao</a>, <a href="/search/eess?searchtype=author&query=Kanda%2C+N">Naoyuki Kanda</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.04281v1-abstract-short" style="display: inline;"> Accurate control of the total duration of generated speech by adjusting the speech rate is crucial for various text-to-speech (TTS) applications. However, the impact of adjusting the speech rate on speech quality, such as intelligibility and speaker characteristics, has been underexplored. 
In this work, we propose a novel total-duration-aware (TDA) duration model for TTS, where phoneme durations a… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.04281v1-abstract-full').style.display = 'inline'; document.getElementById('2406.04281v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.04281v1-abstract-full" style="display: none;"> Accurate control of the total duration of generated speech by adjusting the speech rate is crucial for various text-to-speech (TTS) applications. However, the impact of adjusting the speech rate on speech quality, such as intelligibility and speaker characteristics, has been underexplored. In this work, we propose a novel total-duration-aware (TDA) duration model for TTS, where phoneme durations are predicted not only from the text input but also from an additional input of the total target duration. We also propose a MaskGIT-based duration model that enhances the diversity and quality of the predicted phoneme durations. Our results demonstrate that the proposed TDA duration models achieve better intelligibility and speaker similarity for various speech rate configurations compared to the baseline models. We also show that the proposed MaskGIT-based model can generate phoneme durations with higher quality and diversity compared to its regression or flow-matching counterparts. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.04281v1-abstract-full').style.display = 'none'; document.getElementById('2406.04281v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. 
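<p class="is-size-7">To make the total-duration conditioning described in the abstract above concrete, here is a minimal, hypothetical sketch in Python: per-phoneme durations are predicted from text-side features plus the total target length, then rescaled so they respect that budget. It only illustrates the idea, not the authors' TDA or MaskGIT-based models; every name and the toy linear predictor are assumptions.</p>
<pre><code class="language-python">
# Hypothetical total-duration-aware (TDA) duration predictor, illustration only.
# Per-phoneme durations are predicted from text features AND the total target
# duration, then rescaled so that (before rounding) they sum to that target.
import numpy as np

def predict_tda_durations(phoneme_features, total_frames, weights):
    """phoneme_features: (N, D); total_frames: int; weights: (D + 1,)."""
    n = phoneme_features.shape[0]
    # Broadcast the total-duration condition as an extra feature per phoneme.
    cond = np.full((n, 1), total_frames / max(n, 1), dtype=np.float32)
    x = np.concatenate([phoneme_features, cond], axis=1)
    raw = np.exp(x @ weights)                       # positive raw durations (toy linear model)
    durations = raw * (total_frames / raw.sum())    # enforce the total-duration budget
    return np.maximum(1, np.round(durations)).astype(int)

# Usage: 5 phonemes with 8-dim features, a 120-frame target utterance length.
feats = np.random.randn(5, 8).astype(np.float32)
w = np.random.randn(9).astype(np.float32) * 0.1
print(predict_tda_durations(feats, total_frames=120, weights=w))
</code></pre>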
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to Interspeech 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.03814">arXiv:2406.03814</a> <span> [<a href="https://arxiv.org/pdf/2406.03814">pdf</a>, <a href="https://arxiv.org/format/2406.03814">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Improving Zero-Shot Chinese-English Code-Switching ASR with kNN-CTC and Gated Monolingual Datastores </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhou%2C+J">Jiaming Zhou</a>, <a href="/search/eess?searchtype=author&query=Zhao%2C+S">Shiwan Zhao</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+H">Hui Wang</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+T">Tian-Hao Zhang</a>, <a href="/search/eess?searchtype=author&query=Sun%2C+H">Haoqin Sun</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+X">Xuechen Wang</a>, <a href="/search/eess?searchtype=author&query=Qin%2C+Y">Yong Qin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.03814v2-abstract-short" style="display: inline;"> The kNN-CTC model has proven to be effective for monolingual automatic speech recognition (ASR). However, its direct application to multilingual scenarios like code-switching, presents challenges. Although there is potential for performance improvement, a kNN-CTC model utilizing a single bilingual datastore can inadvertently introduce undesirable noise from the alternative language. To address thi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.03814v2-abstract-full').style.display = 'inline'; document.getElementById('2406.03814v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.03814v2-abstract-full" style="display: none;"> The kNN-CTC model has proven to be effective for monolingual automatic speech recognition (ASR). However, its direct application to multilingual scenarios like code-switching, presents challenges. Although there is potential for performance improvement, a kNN-CTC model utilizing a single bilingual datastore can inadvertently introduce undesirable noise from the alternative language. To address this, we propose a novel kNN-CTC-based code-switching ASR (CS-ASR) framework that employs dual monolingual datastores and a gated datastore selection mechanism to reduce noise interference. Our method selects the appropriate datastore for decoding each frame, ensuring the injection of language-specific information into the ASR process. We apply this framework to cutting-edge CTC-based models, developing an advanced CS-ASR system. Extensive experiments demonstrate the remarkable effectiveness of our gated datastore mechanism in enhancing the performance of zero-shot Chinese-English CS-ASR. 
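<p class="is-size-7">The gating idea in the abstract above can be pictured with a short, assumption-laden sketch: for each frame, both monolingual datastores are queried, the store whose neighbours are closer is selected, and its kNN label distribution is interpolated with the CTC posterior. The datastore layout, gate rule, and interpolation weight below are illustrative guesses, not the paper's implementation.</p>
<pre><code class="language-python">
# Hypothetical per-frame gating over two monolingual kNN datastores.
# store_en / store_zh are (keys, labels) pairs; ctc_posterior is a 1-D frame
# posterior over the vocabulary. Gate rule and weight lam are assumptions.
import numpy as np

def knn_label_distribution(query, keys, labels, vocab_size, k=8, temperature=1.0):
    d = np.linalg.norm(keys - query, axis=1)      # L2 distance to every stored key
    idx = np.argsort(d)[:k]                       # k nearest neighbours
    w = np.exp(-d[idx] / temperature)
    probs = np.zeros(vocab_size)
    np.add.at(probs, labels[idx], w)              # scatter-add neighbour weights
    return probs / probs.sum(), d[idx].mean()

def gated_cs_asr_frame(ctc_posterior, query, store_en, store_zh, lam=0.3, k=8):
    """Pick the datastore whose neighbours are closer, then mix with the CTC posterior."""
    p_en, dist_en = knn_label_distribution(query, *store_en, len(ctc_posterior), k)
    p_zh, dist_zh = knn_label_distribution(query, *store_zh, len(ctc_posterior), k)
    p_knn = p_en if np.argmin([dist_en, dist_zh]) == 0 else p_zh   # gated selection
    return (1.0 - lam) * ctc_posterior + lam * p_knn
</code></pre>
<p class="is-size-7">Gating on the mean neighbour distance is just one plausible rule; the point is that each frame receives language-specific retrieval evidence instead of mixing both languages in a single datastore.</p>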
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.03814v2-abstract-full').style.display = 'none'; document.getElementById('2406.03814v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 6 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.02653">arXiv:2406.02653</a> <span> [<a href="https://arxiv.org/pdf/2406.02653">pdf</a>, <a href="https://arxiv.org/format/2406.02653">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Pancreatic Tumor Segmentation as Anomaly Detection in CT Images Using Denoising Diffusion Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Babaei%2C+R">Reza Babaei</a>, <a href="/search/eess?searchtype=author&query=Cheng%2C+S">Samuel Cheng</a>, <a href="/search/eess?searchtype=author&query=Thai%2C+T">Theresa Thai</a>, <a href="/search/eess?searchtype=author&query=Zhao%2C+S">Shangqing Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.02653v1-abstract-short" style="display: inline;"> Despite the advances in medicine, cancer has remained a formidable challenge. Particularly in the case of pancreatic tumors, characterized by their diversity and late diagnosis, early detection poses a significant challenge crucial for effective treatment. The advancement of deep learning techniques, particularly supervised algorithms, has significantly propelled pancreatic tumor detection in the… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.02653v1-abstract-full').style.display = 'inline'; document.getElementById('2406.02653v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.02653v1-abstract-full" style="display: none;"> Despite the advances in medicine, cancer has remained a formidable challenge. Particularly in the case of pancreatic tumors, characterized by their diversity and late diagnosis, early detection poses a significant challenge crucial for effective treatment. The advancement of deep learning techniques, particularly supervised algorithms, has significantly propelled pancreatic tumor detection in the medical field. However, supervised deep learning approaches necessitate extensive labeled medical images for training, yet acquiring such annotations is both limited and costly. Conversely, weakly supervised anomaly detection methods, requiring only image-level annotations, have garnered interest. 
Existing methodologies predominantly hinge on generative adversarial networks (GANs) or autoencoder models, which can pose complexity in training and, these models may face difficulties in accurately preserving fine image details. This research presents a novel approach to pancreatic tumor detection, employing weak supervision anomaly detection through denoising diffusion algorithms. By incorporating a deterministic iterative process of adding and removing noise along with classifier guidance, the method enables seamless translation of images between diseased and healthy subjects, resulting in detailed anomaly maps without requiring complex training protocols and segmentation masks. This study explores denoising diffusion models as a recent advancement over traditional generative models like GANs, contributing to the field of pancreatic tumor detection. Recognizing the low survival rates of pancreatic cancer, this study emphasizes the need for continued research to leverage diffusion models' efficiency in medical segmentation tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.02653v1-abstract-full').style.display = 'none'; document.getElementById('2406.02653v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.02009">arXiv:2406.02009</a> <span> [<a href="https://arxiv.org/pdf/2406.02009">pdf</a>, <a href="https://arxiv.org/format/2406.02009">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Phonetic Enhanced Language Modeling for Text-to-Speech Synthesis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhou%2C+K">Kun Zhou</a>, <a href="/search/eess?searchtype=author&query=Zhao%2C+S">Shengkui Zhao</a>, <a href="/search/eess?searchtype=author&query=Ma%2C+Y">Yukun Ma</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+C">Chong Zhang</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+H">Hao Wang</a>, <a href="/search/eess?searchtype=author&query=Ng%2C+D">Dianwen Ng</a>, <a href="/search/eess?searchtype=author&query=Ni%2C+C">Chongjia Ni</a>, <a href="/search/eess?searchtype=author&query=Hieu%2C+N+T">Nguyen Trung Hieu</a>, <a href="/search/eess?searchtype=author&query=Yip%2C+J+Q">Jia Qi Yip</a>, <a href="/search/eess?searchtype=author&query=Ma%2C+B">Bin Ma</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.02009v2-abstract-short" style="display: inline;"> Recent language model-based text-to-speech (TTS) frameworks demonstrate scalability and in-context learning capabilities. However, they suffer from robustness issues due to the accumulation of errors in speech unit predictions during autoregressive language modeling. 
In this paper, we propose a phonetic enhanced language modeling method to improve the performance of TTS models. We leverage self-su… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.02009v2-abstract-full').style.display = 'inline'; document.getElementById('2406.02009v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.02009v2-abstract-full" style="display: none;"> Recent language model-based text-to-speech (TTS) frameworks demonstrate scalability and in-context learning capabilities. However, they suffer from robustness issues due to the accumulation of errors in speech unit predictions during autoregressive language modeling. In this paper, we propose a phonetic enhanced language modeling method to improve the performance of TTS models. We leverage self-supervised representations that are phonetically rich as the training target for the autoregressive language model. Subsequently, a non-autoregressive model is employed to predict discrete acoustic codecs that contain fine-grained acoustic details. The TTS model focuses solely on linguistic modeling during autoregressive training, thereby reducing the error propagation that occurs in non-autoregressive training. Both objective and subjective evaluations validate the effectiveness of our proposed method. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.02009v2-abstract-full').style.display = 'none'; document.getElementById('2406.02009v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. 
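<p class="is-size-7">As a reading aid for the two-stage design sketched in this abstract, the control flow might look roughly like the Python outline below: an autoregressive LM emits phonetically rich semantic tokens, and a non-autoregressive model then predicts the acoustic codec tokens in parallel. The callables ar_lm, nar_acoustic_model and codec_decoder are hypothetical stand-ins, not components released with the paper.</p>
<pre><code class="language-python">
# Control-flow outline of the two-stage idea (not the authors' code).
# ar_lm, nar_acoustic_model and codec_decoder are hypothetical callables.

def phonetic_enhanced_tts(text_tokens, prompt, ar_lm, nar_acoustic_model,
                          codec_decoder, max_len=1000, eos_id=0):
    # Stage 1: autoregressive generation of phonetically rich semantic tokens only.
    semantic = list(prompt["semantic_tokens"])
    for _ in range(max_len):
        nxt = ar_lm(text_tokens, semantic)          # next semantic token id
        if nxt == eos_id:
            break
        semantic.append(nxt)

    # Stage 2: non-autoregressive prediction of fine-grained acoustic codec tokens,
    # conditioned on the semantic tokens and an acoustic prompt.
    acoustic_codes = nar_acoustic_model(semantic, prompt["acoustic_tokens"])

    # Reconstruct the waveform from the discrete acoustic codes.
    return codec_decoder(acoustic_codes)
</code></pre>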
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by Interspeech 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.20279">arXiv:2405.20279</a> <span> [<a href="https://arxiv.org/pdf/2405.20279">pdf</a>, <a href="https://arxiv.org/format/2405.20279">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> CV-VAE: A Compatible Video VAE for Latent Generative Video Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhao%2C+S">Sijie Zhao</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+Y">Yong Zhang</a>, <a href="/search/eess?searchtype=author&query=Cun%2C+X">Xiaodong Cun</a>, <a href="/search/eess?searchtype=author&query=Yang%2C+S">Shaoshu Yang</a>, <a href="/search/eess?searchtype=author&query=Niu%2C+M">Muyao Niu</a>, <a href="/search/eess?searchtype=author&query=Li%2C+X">Xiaoyu Li</a>, <a href="/search/eess?searchtype=author&query=Hu%2C+W">Wenbo Hu</a>, <a href="/search/eess?searchtype=author&query=Shan%2C+Y">Ying Shan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.20279v2-abstract-short" style="display: inline;"> Spatio-temporal compression of videos, utilizing networks such as Variational Autoencoders (VAE), plays a crucial role in OpenAI's SORA and numerous other video generative models. For instance, many LLM-like video models learn the distribution of discrete tokens derived from 3D VAEs within the VQVAE framework, while most diffusion-based video models capture the distribution of continuous latent ex… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.20279v2-abstract-full').style.display = 'inline'; document.getElementById('2405.20279v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.20279v2-abstract-full" style="display: none;"> Spatio-temporal compression of videos, utilizing networks such as Variational Autoencoders (VAE), plays a crucial role in OpenAI's SORA and numerous other video generative models. For instance, many LLM-like video models learn the distribution of discrete tokens derived from 3D VAEs within the VQVAE framework, while most diffusion-based video models capture the distribution of continuous latents extracted by 2D VAEs without quantization. The temporal compression is simply realized by uniform frame sampling, which results in unsmooth motion between consecutive frames. Currently, the research community lacks a commonly used continuous video (3D) VAE for latent diffusion-based video models.
Moreover, since current diffusion-based approaches are often implemented using pre-trained text-to-image (T2I) models, directly training a video VAE without considering the compatibility with existing T2I models will result in a latent space gap between them, which will take huge computational resources for training to bridge the gap even with the T2I models as initialization. To address this issue, we propose a method for training a video VAE of latent video models, namely CV-VAE, whose latent space is compatible with that of a given image VAE, e.g., image VAE of Stable Diffusion (SD). The compatibility is achieved by the proposed novel latent space regularization, which involves formulating a regularization loss using the image VAE. Benefiting from the latent space compatibility, video models can be trained seamlessly from pre-trained T2I or video models in a truly spatio-temporally compressed latent space, rather than simply sampling video frames at equal intervals. With our CV-VAE, existing video models can generate four times more frames with minimal finetuning. Extensive experiments are conducted to demonstrate the effectiveness of the proposed video VAE. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.20279v2-abstract-full').style.display = 'none'; document.getElementById('2405.20279v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 30 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project Page: https://ailab-cvc.github.io/cvvae/index.html</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.17809">arXiv:2405.17809</a> <span> [<a href="https://arxiv.org/pdf/2405.17809">pdf</a>, <a href="https://arxiv.org/format/2405.17809">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> TransVIP: Speech to Speech Translation System with Voice and Isochrony Preservation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Le%2C+C">Chenyang Le</a>, <a href="/search/eess?searchtype=author&query=Qian%2C+Y">Yao Qian</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+D">Dongmei Wang</a>, <a href="/search/eess?searchtype=author&query=Zhou%2C+L">Long Zhou</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+S">Shujie Liu</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+X">Xiaofei Wang</a>, <a href="/search/eess?searchtype=author&query=Yousefi%2C+M">Midia Yousefi</a>, <a href="/search/eess?searchtype=author&query=Qian%2C+Y">Yanmin Qian</a>, <a href="/search/eess?searchtype=author&query=Li%2C+J">Jinyu 
Li</a>, <a href="/search/eess?searchtype=author&query=Zhao%2C+S">Sheng Zhao</a>, <a href="/search/eess?searchtype=author&query=Zeng%2C+M">Michael Zeng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.17809v3-abstract-short" style="display: inline;"> There is a rising interest and trend in research towards directly translating speech from one language to another, known as end-to-end speech-to-speech translation. However, most end-to-end models struggle to outperform cascade models, i.e., a pipeline framework by concatenating speech recognition, machine translation and text-to-speech models. The primary challenges stem from the inherent complex… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.17809v3-abstract-full').style.display = 'inline'; document.getElementById('2405.17809v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.17809v3-abstract-full" style="display: none;"> There is a rising interest and trend in research towards directly translating speech from one language to another, known as end-to-end speech-to-speech translation. However, most end-to-end models struggle to outperform cascade models, i.e., a pipeline framework by concatenating speech recognition, machine translation and text-to-speech models. The primary challenges stem from the inherent complexities involved in direct translation tasks and the scarcity of data. In this study, we introduce a novel model framework TransVIP that leverages diverse datasets in a cascade fashion yet facilitates end-to-end inference through joint probability. Furthermore, we propose two separated encoders to preserve the speaker's voice characteristics and isochrony from the source speech during the translation process, making it highly suitable for scenarios such as video dubbing. Our experiments on the French-English language pair demonstrate that our model outperforms the current state-of-the-art speech-to-speech translation model. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.17809v3-abstract-full').style.display = 'none'; document.getElementById('2405.17809v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 28 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. 
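<p class="is-size-7">One way to picture "cascaded sub-models with end-to-end inference through joint probability", as described above, is to rank candidates by the sum of the sub-models' log-probabilities while two dedicated encoders condition synthesis on voice and timing. The sketch below is a loose illustration under that reading; the object interfaces and the scoring rule are assumptions, not the TransVIP code.</p>
<pre><code class="language-python">
# Assumption-heavy sketch: rank candidate translations by a joint log-probability
# accumulated across cascaded sub-models, with voice and isochrony encoders
# conditioning the synthesis stage. All interfaces are invented for illustration.

def joint_score(source_speech, candidate, s2t_model, t2u_model,
                voice_encoder, isochrony_encoder):
    lp_text = s2t_model.log_prob(candidate["text"], source_speech)   # log p(text | source)
    voice = voice_encoder(source_speech)        # speaker-voice conditioning
    timing = isochrony_encoder(source_speech)   # duration/isochrony conditioning
    lp_units = t2u_model.log_prob(candidate["units"], candidate["text"], voice, timing)
    return lp_text + lp_units                   # joint log-probability used for ranking

def rerank(source_speech, candidates, **models):
    return max(candidates, key=lambda c: joint_score(source_speech, c, **models))
</code></pre>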
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Neural Information Processing Systems, poster</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.03254">arXiv:2405.03254</a> <span> [<a href="https://arxiv.org/pdf/2405.03254">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Automatic Assessment of Dysarthria Using Audio-visual Vowel Graph Attention Network </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Liu%2C+X">Xiaokang Liu</a>, <a href="/search/eess?searchtype=author&query=Du%2C+X">Xiaoxia Du</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+J">Juan Liu</a>, <a href="/search/eess?searchtype=author&query=Su%2C+R">Rongfeng Su</a>, <a href="/search/eess?searchtype=author&query=Ng%2C+M+L">Manwa Lawrence Ng</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+Y">Yumei Zhang</a>, <a href="/search/eess?searchtype=author&query=Yang%2C+Y">Yudong Yang</a>, <a href="/search/eess?searchtype=author&query=Zhao%2C+S">Shaofeng Zhao</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+L">Lan Wang</a>, <a href="/search/eess?searchtype=author&query=Yan%2C+N">Nan Yan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.03254v2-abstract-short" style="display: inline;"> Automatic assessment of dysarthria remains a highly challenging task due to high variability in acoustic signals and the limited data. Currently, research on the automatic assessment of dysarthria primarily focuses on two approaches: one that utilizes expert features combined with machine learning, and the other that employs data-driven deep learning methods to extract representations. Research ha… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.03254v2-abstract-full').style.display = 'inline'; document.getElementById('2405.03254v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.03254v2-abstract-full" style="display: none;"> Automatic assessment of dysarthria remains a highly challenging task due to high variability in acoustic signals and the limited data. Currently, research on the automatic assessment of dysarthria primarily focuses on two approaches: one that utilizes expert features combined with machine learning, and the other that employs data-driven deep learning methods to extract representations. Research has demonstrated that expert features are effective in representing pathological characteristics, while deep learning methods excel at uncovering latent features. Therefore, integrating the advantages of expert features and deep learning to construct a neural network architecture based on expert knowledge may be beneficial for interpretability and assessment performance. In this context, the present paper proposes a vowel graph attention network based on audio-visual information, which effectively integrates the strengths of expert knowledges and deep learning. 
Firstly, various features were combined as inputs, including knowledge based acoustical features and deep learning based pre-trained representations. Secondly, the graph network structure based on vowel space theory was designed, allowing for a deep exploration of spatial correlations among vowels. Finally, visual information was incorporated into the model to further enhance its robustness and generalizability. The method exhibited superior performance in regression experiments targeting Frenchay scores compared to existing approaches. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.03254v2-abstract-full').style.display = 'none'; document.getElementById('2405.03254v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 6 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages, 7 figures, 7 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.16825">arXiv:2404.16825</a> <span> [<a href="https://arxiv.org/pdf/2404.16825">pdf</a>, <a href="https://arxiv.org/format/2404.16825">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> ResVR: Joint Rescaling and Viewport Rendering of Omnidirectional Images </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Li%2C+W">Weiqi Li</a>, <a href="/search/eess?searchtype=author&query=Zhao%2C+S">Shijie Zhao</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+B">Bin Chen</a>, <a href="/search/eess?searchtype=author&query=Cheng%2C+X">Xinhua Cheng</a>, <a href="/search/eess?searchtype=author&query=Li%2C+J">Junlin Li</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+L">Li Zhang</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+J">Jian Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.16825v1-abstract-short" style="display: inline;"> With the advent of virtual reality technology, omnidirectional image (ODI) rescaling techniques are increasingly embraced for reducing transmitted and stored file sizes while preserving high image quality. 
Despite this progress, current ODI rescaling methods predominantly focus on enhancing the quality of images in equirectangular projection (ERP) format, which overlooks the fact that the content… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.16825v1-abstract-full').style.display = 'inline'; document.getElementById('2404.16825v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.16825v1-abstract-full" style="display: none;"> With the advent of virtual reality technology, omnidirectional image (ODI) rescaling techniques are increasingly embraced for reducing transmitted and stored file sizes while preserving high image quality. Despite this progress, current ODI rescaling methods predominantly focus on enhancing the quality of images in equirectangular projection (ERP) format, which overlooks the fact that the content viewed on head mounted displays (HMDs) is actually a rendered viewport instead of an ERP image. In this work, we emphasize that focusing solely on ERP quality results in inferior viewport visual experiences for users. Thus, we propose ResVR, which is the first comprehensive framework for the joint Rescaling and Viewport Rendering of ODIs. ResVR allows obtaining LR ERP images for transmission while rendering high-quality viewports for users to watch on HMDs. In our ResVR, a novel discrete pixel sampling strategy is developed to tackle the complex mapping between the viewport and ERP, enabling end-to-end training of ResVR pipeline. Furthermore, a spherical pixel shape representation technique is innovatively derived from spherical differentiation to significantly improve the visual quality of rendered viewports. Extensive experiments demonstrate that our ResVR outperforms existing methods in viewport rendering tasks across different fields of view, resolutions, and view directions while keeping a low transmission overhead. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.16825v1-abstract-full').style.display = 'none'; document.getElementById('2404.16825v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. 
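<p class="is-size-7">For readers unfamiliar with the viewport/ERP relationship that ResVR exploits, the snippet below renders a perspective viewport from an equirectangular image with plain nearest-neighbour sampling. This is generic 360-degree image math for orientation only; it implements neither ResVR's learned discrete pixel sampling strategy nor its spherical pixel shape representation.</p>
<pre><code class="language-python">
# Generic viewport-from-ERP renderer (nearest-neighbour), illustration only.
import numpy as np

def render_viewport(erp, fov_deg, yaw_deg, pitch_deg, out_h, out_w):
    """erp: H x W (x C) equirectangular image; returns an out_h x out_w viewport."""
    H, W = erp.shape[:2]
    f = 0.5 * out_w / np.tan(np.radians(fov_deg) / 2.0)       # pinhole focal length
    xs = np.arange(out_w) - (out_w - 1) / 2.0
    ys = np.arange(out_h) - (out_h - 1) / 2.0
    x, y = np.meshgrid(xs, ys)
    rays = np.stack([x, y, np.full_like(x, f)], axis=-1)      # camera-frame rays
    rays = rays / np.linalg.norm(rays, axis=-1, keepdims=True)

    yaw, pitch = np.radians(yaw_deg), np.radians(pitch_deg)
    Ry = np.array([[np.cos(yaw), 0, np.sin(yaw)], [0, 1, 0], [-np.sin(yaw), 0, np.cos(yaw)]])
    Rx = np.array([[1, 0, 0], [0, np.cos(pitch), -np.sin(pitch)], [0, np.sin(pitch), np.cos(pitch)]])
    d = rays @ (Ry @ Rx).T                                    # rotate rays to the view direction

    lon = np.arctan2(d[..., 0], d[..., 2])                    # longitude in [-pi, pi]
    lat = np.arcsin(np.clip(d[..., 1], -1.0, 1.0))            # latitude in [-pi/2, pi/2]
    u = ((lon / (2 * np.pi) + 0.5) * (W - 1)).astype(int)     # ERP column index
    v = ((lat / np.pi + 0.5) * (H - 1)).astype(int)           # ERP row index
    return erp[v, u]
</code></pre>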
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.10343">arXiv:2404.10343</a> <span> [<a href="https://arxiv.org/pdf/2404.10343">pdf</a>, <a href="https://arxiv.org/format/2404.10343">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> The Ninth NTIRE 2024 Efficient Super-Resolution Challenge Report </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Ren%2C+B">Bin Ren</a>, <a href="/search/eess?searchtype=author&query=Li%2C+Y">Yawei Li</a>, <a href="/search/eess?searchtype=author&query=Mehta%2C+N">Nancy Mehta</a>, <a href="/search/eess?searchtype=author&query=Timofte%2C+R">Radu Timofte</a>, <a href="/search/eess?searchtype=author&query=Yu%2C+H">Hongyuan Yu</a>, <a href="/search/eess?searchtype=author&query=Wan%2C+C">Cheng Wan</a>, <a href="/search/eess?searchtype=author&query=Hong%2C+Y">Yuxin Hong</a>, <a href="/search/eess?searchtype=author&query=Han%2C+B">Bingnan Han</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+Z">Zhuoyuan Wu</a>, <a href="/search/eess?searchtype=author&query=Zou%2C+Y">Yajun Zou</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+Y">Yuqing Liu</a>, <a href="/search/eess?searchtype=author&query=Li%2C+J">Jizhe Li</a>, <a href="/search/eess?searchtype=author&query=He%2C+K">Keji He</a>, <a href="/search/eess?searchtype=author&query=Fan%2C+C">Chao Fan</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+H">Heng Zhang</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+X">Xiaolin Zhang</a>, <a href="/search/eess?searchtype=author&query=Yin%2C+X">Xuanwu Yin</a>, <a href="/search/eess?searchtype=author&query=Zuo%2C+K">Kunlong Zuo</a>, <a href="/search/eess?searchtype=author&query=Liao%2C+B">Bohao Liao</a>, <a href="/search/eess?searchtype=author&query=Xia%2C+P">Peizhe Xia</a>, <a href="/search/eess?searchtype=author&query=Peng%2C+L">Long Peng</a>, <a href="/search/eess?searchtype=author&query=Du%2C+Z">Zhibo Du</a>, <a href="/search/eess?searchtype=author&query=Di%2C+X">Xin Di</a>, <a href="/search/eess?searchtype=author&query=Li%2C+W">Wangkai Li</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+Y">Yang Wang</a> , et al. (109 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.10343v2-abstract-short" style="display: inline;"> This paper provides a comprehensive review of the NTIRE 2024 challenge, focusing on efficient single-image super-resolution (ESR) solutions and their outcomes. The task of this challenge is to super-resolve an input image with a magnification factor of x4 based on pairs of low and corresponding high-resolution images. 
The primary objective is to develop networks that optimize various aspects such… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.10343v2-abstract-full').style.display = 'inline'; document.getElementById('2404.10343v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.10343v2-abstract-full" style="display: none;"> This paper provides a comprehensive review of the NTIRE 2024 challenge, focusing on efficient single-image super-resolution (ESR) solutions and their outcomes. The task of this challenge is to super-resolve an input image with a magnification factor of x4 based on pairs of low and corresponding high-resolution images. The primary objective is to develop networks that optimize various aspects such as runtime, parameters, and FLOPs, while still maintaining a peak signal-to-noise ratio (PSNR) of approximately 26.90 dB on the DIV2K_LSDIR_valid dataset and 26.99 dB on the DIV2K_LSDIR_test dataset. In addition, this challenge has 4 tracks including the main track (overall performance), sub-track 1 (runtime), sub-track 2 (FLOPs), and sub-track 3 (parameters). In the main track, all three metrics (ie runtime, FLOPs, and parameter count) were considered. The ranking of the main track is calculated based on a weighted sum-up of the scores of all other sub-tracks. In sub-track 1, the practical runtime performance of the submissions was evaluated, and the corresponding score was used to determine the ranking. In sub-track 2, the number of FLOPs was considered. The score calculated based on the corresponding FLOPs was used to determine the ranking. In sub-track 3, the number of parameters was considered. The score calculated based on the corresponding parameters was used to determine the ranking. RLFN is set as the baseline for efficiency measurement. The challenge had 262 registered participants, and 34 teams made valid submissions. They gauge the state-of-the-art in efficient single-image super-resolution. To facilitate the reproducibility of the challenge and enable other researchers to build upon these findings, the code and the pre-trained model of validated solutions are made publicly available at https://github.com/Amazingren/NTIRE2024_ESR/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.10343v2-abstract-full').style.display = 'none'; document.getElementById('2404.10343v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 16 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. 
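<p class="is-size-7">The main-track ranking described above (a weighted combination of runtime, FLOPs and parameter scores measured against the RLFN baseline) can be illustrated with a toy scorer. The weights, baseline numbers and normalisation below are placeholders invented for illustration; the abstract does not specify the official formula.</p>
<pre><code class="language-python">
# Illustrative only: one way to combine sub-track scores into a main-track
# ranking, normalising each efficiency metric against an RLFN baseline.
# The actual NTIRE 2024 weights and score definitions are NOT given above.

RLFN_BASELINE = {"runtime_ms": 1.0, "flops_g": 1.0, "params_m": 1.0}  # placeholder values
WEIGHTS = {"runtime_ms": 0.5, "flops_g": 0.25, "params_m": 0.25}      # hypothetical weights

def main_track_score(submission):
    """Lower is better: each metric is expressed relative to the baseline."""
    total = 0.0
    for metric, weight in WEIGHTS.items():
        total += weight * submission[metric] / RLFN_BASELINE[metric]
    return total

entries = [{"team": "A", "runtime_ms": 0.8, "flops_g": 0.9, "params_m": 1.1},
           {"team": "B", "runtime_ms": 1.2, "flops_g": 0.7, "params_m": 0.6}]
ranking = sorted(entries, key=main_track_score)
print([e["team"] for e in ranking])
</code></pre>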
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">The report paper of NTIRE2024 Efficient Super-resolution, accepted by CVPRW2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.06690">arXiv:2404.06690</a> <span> [<a href="https://arxiv.org/pdf/2404.06690">pdf</a>, <a href="https://arxiv.org/format/2404.06690">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> CoVoMix: Advancing Zero-Shot Speech Generation for Human-like Multi-talker Conversations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhang%2C+L">Leying Zhang</a>, <a href="/search/eess?searchtype=author&query=Qian%2C+Y">Yao Qian</a>, <a href="/search/eess?searchtype=author&query=Zhou%2C+L">Long Zhou</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+S">Shujie Liu</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+D">Dongmei Wang</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+X">Xiaofei Wang</a>, <a href="/search/eess?searchtype=author&query=Yousefi%2C+M">Midia Yousefi</a>, <a href="/search/eess?searchtype=author&query=Qian%2C+Y">Yanmin Qian</a>, <a href="/search/eess?searchtype=author&query=Li%2C+J">Jinyu Li</a>, <a href="/search/eess?searchtype=author&query=He%2C+L">Lei He</a>, <a href="/search/eess?searchtype=author&query=Zhao%2C+S">Sheng Zhao</a>, <a href="/search/eess?searchtype=author&query=Zeng%2C+M">Michael Zeng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.06690v2-abstract-short" style="display: inline;"> Recent advancements in zero-shot text-to-speech (TTS) modeling have led to significant strides in generating high-fidelity and diverse speech. However, dialogue generation, along with achieving human-like naturalness in speech, continues to be a challenge. In this paper, we introduce CoVoMix: Conversational Voice Mixture Generation, a novel model for zero-shot, human-like, multi-speaker, multi-rou… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.06690v2-abstract-full').style.display = 'inline'; document.getElementById('2404.06690v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.06690v2-abstract-full" style="display: none;"> Recent advancements in zero-shot text-to-speech (TTS) modeling have led to significant strides in generating high-fidelity and diverse speech. However, dialogue generation, along with achieving human-like naturalness in speech, continues to be a challenge. 
In this paper, we introduce CoVoMix: Conversational Voice Mixture Generation, a novel model for zero-shot, human-like, multi-speaker, multi-round dialogue speech generation. CoVoMix first converts dialogue text into multiple streams of discrete tokens, with each token stream representing semantic information for individual talkers. These token streams are then fed into a flow-matching based acoustic model to generate mixed mel-spectrograms. Finally, the speech waveforms are produced using a HiFi-GAN model. Furthermore, we devise a comprehensive set of metrics for measuring the effectiveness of dialogue modeling and generation. Our experimental results show that CoVoMix can generate dialogues that are not only human-like in their naturalness and coherence but also involve multiple talkers engaging in multiple rounds of conversation. This is exemplified by instances generated in a single channel where one speaker's utterance is seamlessly mixed with another's interjections or laughter, indicating the latter's role as an attentive listener. Audio samples are available at https://aka.ms/covomix. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.06690v2-abstract-full').style.display = 'none'; document.getElementById('2404.06690v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 9 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.06054">arXiv:2404.06054</a> <span> [<a href="https://arxiv.org/pdf/2404.06054">pdf</a>, <a href="https://arxiv.org/format/2404.06054">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Pseudo MIMO (pMIMO): An Energy and Spectral Efficient MIMO-OFDM System </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Wang%2C+S">Sen Wang</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+T">Tianxiong Wang</a>, <a href="/search/eess?searchtype=author&query=Zhao%2C+S">Shulun Zhao</a>, <a href="/search/eess?searchtype=author&query=Feng%2C+Z">Zhen Feng</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+G">Guangyi Liu</a>, <a href="/search/eess?searchtype=author&query=Cui%2C+C">Chunfeng Cui</a>, <a href="/search/eess?searchtype=author&query=I%2C+C">Chih-Lin I</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+J">Jiangzhou Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.06054v1-abstract-short" style="display: inline;"> This article introduces an energy and spectral efficient multiple-input multiple-output orthogonal frequency division multiplexing (MIMO-OFDM) transmission scheme designed for the future sixth generation (6G) wireless communication networks. 
The approach involves connecting each receiving radio frequency (RF) chain with multiple antenna elements and conducting sample-level adjustments for receivin… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.06054v1-abstract-full').style.display = 'inline'; document.getElementById('2404.06054v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.06054v1-abstract-full" style="display: none;"> This article introduces an energy and spectral efficient multiple-input multiple-output orthogonal frequency division multiplexing (MIMO-OFDM) transmission scheme designed for the future sixth generation (6G) wireless communication networks. The approach involves connecting each receiving radio frequency (RF) chain with multiple antenna elements and conducting sample-level adjustments for receiving beamforming patterns. The proposed system architecture and the dedicated signal processing methods enable the scheme to transmit a bigger number of parallel data streams than the number of receiving RF chains, achieving a spectral efficiency performance close to that of a fully digital (FD) MIMO system with the same number of antenna elements, each equipped with an RF chain. We refer to this system as a ''pseudo MIMO'' system due to its ability to mimic the functionality of additional invisible RF chains. The article begins with introducing the underlying principles of pseudo MIMO and discussing potential hardware architectures for its implementation. We then highlight several advantages of integrating pseudo MIMO into next-generation wireless networks. To demonstrate the superiority of our proposed pseudo MIMO transmission scheme to conventional MIMO systems, simulation results are presented. Additionally, we validate the feasibility of this new scheme by building the first pseudo MIMO prototype. Furthermore, we present some key challenges and outline potential directions for future research. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.06054v1-abstract-full').style.display = 'none'; document.getElementById('2404.06054v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. 
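<p class="is-size-7">The core pseudo MIMO idea above, namely that sample-level changes to the receive combining weights let a few RF chains stand in for many, can be illustrated with a toy NumPy snippet. The dimensions and random phase-shifter weights are arbitrary assumptions, and this is not the prototype described in the article.</p>
<pre><code class="language-python">
# Toy NumPy illustration: per-sample receive combining lets 2 RF chains collect
# 8 distinct linear measurements of an 8-antenna array over a 4-sample block.
# Not the pMIMO prototype; dimensions and weights are arbitrary.
import numpy as np

n_ant, n_rf, n_samples = 8, 2, 4
rng = np.random.default_rng(0)
antenna_signals = (rng.standard_normal((n_ant, n_samples))
                   + 1j * rng.standard_normal((n_ant, n_samples)))

# One combining matrix per sample, shape (n_samples, n_rf, n_ant); in hardware
# these would be phase-shifter settings updated sample by sample.
W = np.exp(1j * rng.uniform(0, 2 * np.pi, size=(n_samples, n_rf, n_ant)))

# Digital samples actually observed: y[:, t] = W[t] @ antenna_signals[:, t]
y = np.einsum("tra,at->rt", W, antenna_signals)
print(y.shape)   # (2, 4): only 2 chains, yet 2 x 4 = 8 distinct measurements
</code></pre>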
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.03204">arXiv:2404.03204</a> <span> [<a href="https://arxiv.org/pdf/2404.03204">pdf</a>, <a href="https://arxiv.org/format/2404.03204">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> RALL-E: Robust Codec Language Modeling with Chain-of-Thought Prompting for Text-to-Speech Synthesis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Xin%2C+D">Detai Xin</a>, <a href="/search/eess?searchtype=author&query=Tan%2C+X">Xu Tan</a>, <a href="/search/eess?searchtype=author&query=Shen%2C+K">Kai Shen</a>, <a href="/search/eess?searchtype=author&query=Ju%2C+Z">Zeqian Ju</a>, <a href="/search/eess?searchtype=author&query=Yang%2C+D">Dongchao Yang</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+Y">Yuancheng Wang</a>, <a href="/search/eess?searchtype=author&query=Takamichi%2C+S">Shinnosuke Takamichi</a>, <a href="/search/eess?searchtype=author&query=Saruwatari%2C+H">Hiroshi Saruwatari</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+S">Shujie Liu</a>, <a href="/search/eess?searchtype=author&query=Li%2C+J">Jinyu Li</a>, <a href="/search/eess?searchtype=author&query=Zhao%2C+S">Sheng Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.03204v3-abstract-short" style="display: inline;"> We present RALL-E, a robust language modeling method for text-to-speech (TTS) synthesis. While previous work based on large language models (LLMs) shows impressive performance on zero-shot TTS, such methods often suffer from poor robustness, such as unstable prosody (weird pitch and rhythm/duration) and a high word error rate (WER), due to the autoregressive prediction style of language models. Th… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.03204v3-abstract-full').style.display = 'inline'; document.getElementById('2404.03204v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.03204v3-abstract-full" style="display: none;"> We present RALL-E, a robust language modeling method for text-to-speech (TTS) synthesis. While previous work based on large language models (LLMs) shows impressive performance on zero-shot TTS, such methods often suffer from poor robustness, such as unstable prosody (weird pitch and rhythm/duration) and a high word error rate (WER), due to the autoregressive prediction style of language models. The core idea behind RALL-E is chain-of-thought (CoT) prompting, which decomposes the task into simpler steps to enhance the robustness of LLM-based TTS. 
To accomplish this idea, RALL-E first predicts prosody features (pitch and duration) of the input text and uses them as intermediate conditions to predict speech tokens in a CoT style. Second, RALL-E utilizes the predicted duration prompt to guide the computing of self-attention weights in Transformer to enforce the model to focus on the corresponding phonemes and prosody features when predicting speech tokens. Results of comprehensive objective and subjective evaluations demonstrate that, compared to a powerful baseline method VALL-E, RALL-E significantly improves the WER of zero-shot TTS from $5.6\%$ (without reranking) and $1.7\%$ (with reranking) to $2.5\%$ and $1.0\%$, respectively. Furthermore, we demonstrate that RALL-E correctly synthesizes sentences that are hard for VALL-E and reduces the error rate from $68\%$ to $4\%$. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.03204v3-abstract-full').style.display = 'none'; document.getElementById('2404.03204v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.01164">arXiv:2404.01164</a> <span> [<a href="https://arxiv.org/pdf/2404.01164">pdf</a>, <a href="https://arxiv.org/ps/2404.01164">ps</a>, <a href="https://arxiv.org/format/2404.01164">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Unified Predefined-time Stability Conditions of Nonlinear Systems with Lyapunov Analysis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Xiao%2C+B">Bing Xiao</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+H">Haichao Zhang</a>, <a href="/search/eess?searchtype=author&query=Zhao%2C+S">Shijie Zhao</a>, <a href="/search/eess?searchtype=author&query=Cao%2C+L">Lu Cao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.01164v1-abstract-short" style="display: inline;"> This brief gives a set of unified Lyapunov stability conditions to guarantee the predefined-time/finite-time stability of a dynamical systems. The derived Lyapunov theorem for autonomous systems establishes equivalence with existing theorems on predefined-time/finite-time stability. The findings proposed herein develop a nonsingular sliding mode control framework for an Euler-Lagrange system to an… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.01164v1-abstract-full').style.display = 'inline'; document.getElementById('2404.01164v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.01164v1-abstract-full" style="display: none;"> This brief gives a set of unified Lyapunov stability conditions to guarantee the predefined-time/finite-time stability of a dynamical systems. 
The derived Lyapunov theorem for autonomous systems establishes equivalence with existing theorems on predefined-time/finite-time stability. The findings proposed herein develop a nonsingular sliding mode control framework for an Euler-Lagrange system to analyze its stability, and its upper bound for the settling time can be arbitrarily determined a priori through predefined time constant. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.01164v1-abstract-full').style.display = 'none'; document.getElementById('2404.01164v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> </li> </ol> </div> </main> </body> </html>