
Search | arXiv e-print repository

<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1&ndash;50 of 300 results for author: <span class="mathjax">Zhou, J</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> <div class="content"> <form method="GET" action="/search/eess" aria-role="search"> Searching in archive <strong>eess</strong>. <a href="/search/?searchtype=author&amp;query=Zhou%2C+J">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Zhou, J"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Zhou%2C+J&amp;terms-0-field=author&amp;size=50&amp;order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Zhou, J"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Zhou%2C+J&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Zhou%2C+J&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Zhou%2C+J&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Zhou%2C+J&amp;start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Zhou%2C+J&amp;start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Zhou%2C+J&amp;start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Zhou%2C+J&amp;start=250" class="pagination-link " aria-label="Page 6" aria-current="page">6 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Zhou%2C+J&amp;start=300" class="pagination-link " aria-label="Page 7" aria-current="page">7 </a> </li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.11128">arXiv:2502.11128</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.11128">pdf</a>, <a href="https://arxiv.org/format/2502.11128">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> FELLE: Autoregressive Speech Synthesis with Token-Wise Coarse-to-Fine Flow Matching </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Wang%2C+H">Hui Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+S">Shujie Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Meng%2C+L">Lingwei Meng</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+J">Jinyu Li</a>, <a href="/search/eess?searchtype=author&amp;query=Yang%2C+Y">Yifan Yang</a>, <a href="/search/eess?searchtype=author&amp;query=Zhao%2C+S">Shiwan Zhao</a>, <a href="/search/eess?searchtype=author&amp;query=Sun%2C+H">Haiyang Sun</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+Y">Yanqing Liu</a>, <a 
href="/search/eess?searchtype=author&amp;query=Sun%2C+H">Haoqin Sun</a>, <a href="/search/eess?searchtype=author&amp;query=Zhou%2C+J">Jiaming Zhou</a>, <a href="/search/eess?searchtype=author&amp;query=Lu%2C+Y">Yan Lu</a>, <a href="/search/eess?searchtype=author&amp;query=Qin%2C+Y">Yong Qin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.11128v1-abstract-short" style="display: inline;"> To advance continuous-valued token modeling and temporal-coherence enforcement, we propose FELLE, an autoregressive model that integrates language modeling with token-wise flow matching. By leveraging the autoregressive nature of language models and the generative efficacy of flow matching, FELLE effectively predicts continuous-valued tokens (mel-spectrograms). For each continuous-valued token, FE&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11128v1-abstract-full').style.display = 'inline'; document.getElementById('2502.11128v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.11128v1-abstract-full" style="display: none;"> To advance continuous-valued token modeling and temporal-coherence enforcement, we propose FELLE, an autoregressive model that integrates language modeling with token-wise flow matching. By leveraging the autoregressive nature of language models and the generative efficacy of flow matching, FELLE effectively predicts continuous-valued tokens (mel-spectrograms). For each continuous-valued token, FELLE modifies the general prior distribution in flow matching by incorporating information from the previous step, improving coherence and stability. Furthermore, to enhance synthesis quality, FELLE introduces a coarse-to-fine flow-matching mechanism, generating continuous-valued tokens hierarchically, conditioned on the language model&#39;s output. Experimental results demonstrate the potential of incorporating flow-matching techniques in autoregressive mel-spectrogram modeling, leading to significant improvements in TTS generation quality, as shown in https://aka.ms/felle. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11128v1-abstract-full').style.display = 'none'; document.getElementById('2502.11128v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.06289">arXiv:2502.06289</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.06289">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Is an Ultra Large Natural Image-Based Foundation Model Superior to a Retina-Specific Model for Detecting Ocular and Systemic Diseases? 
</p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Hou%2C+Q">Qingshan Hou</a>, <a href="/search/eess?searchtype=author&amp;query=Zhou%2C+Y">Yukun Zhou</a>, <a href="/search/eess?searchtype=author&amp;query=Goh%2C+J+H+L">Jocelyn Hui Lin Goh</a>, <a href="/search/eess?searchtype=author&amp;query=Zou%2C+K">Ke Zou</a>, <a href="/search/eess?searchtype=author&amp;query=Yew%2C+S+M+E">Samantha Min Er Yew</a>, <a href="/search/eess?searchtype=author&amp;query=Srinivasan%2C+S">Sahana Srinivasan</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+M">Meng Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Lo%2C+T">Thaddaeus Lo</a>, <a href="/search/eess?searchtype=author&amp;query=Lei%2C+X">Xiaofeng Lei</a>, <a href="/search/eess?searchtype=author&amp;query=Wagner%2C+S+K">Siegfried K. Wagner</a>, <a href="/search/eess?searchtype=author&amp;query=Chia%2C+M+A">Mark A. Chia</a>, <a href="/search/eess?searchtype=author&amp;query=Yang%2C+D">Dawei Yang</a>, <a href="/search/eess?searchtype=author&amp;query=Jiang%2C+H">Hongyang Jiang</a>, <a href="/search/eess?searchtype=author&amp;query=Ran%2C+A">AnRan Ran</a>, <a href="/search/eess?searchtype=author&amp;query=Santos%2C+R">Rui Santos</a>, <a href="/search/eess?searchtype=author&amp;query=Somfai%2C+G+M">Gabor Mark Somfai</a>, <a href="/search/eess?searchtype=author&amp;query=Zhou%2C+J+H">Juan Helen Zhou</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+H">Haoyu Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+Q">Qingyu Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Cheung%2C+C+Y">Carol Yim-Lui Cheung</a>, <a href="/search/eess?searchtype=author&amp;query=Keane%2C+P+A">Pearse A. Keane</a>, <a href="/search/eess?searchtype=author&amp;query=Tham%2C+Y+C">Yih Chung Tham</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.06289v1-abstract-short" style="display: inline;"> The advent of foundation models (FMs) is transforming medical domain. In ophthalmology, RETFound, a retina-specific FM pre-trained sequentially on 1.4 million natural images and 1.6 million retinal images, has demonstrated high adaptability across clinical applications. Conversely, DINOv2, a general-purpose vision FM pre-trained on 142 million natural images, has shown promise in non-medical domai&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06289v1-abstract-full').style.display = 'inline'; document.getElementById('2502.06289v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.06289v1-abstract-full" style="display: none;"> The advent of foundation models (FMs) is transforming medical domain. In ophthalmology, RETFound, a retina-specific FM pre-trained sequentially on 1.4 million natural images and 1.6 million retinal images, has demonstrated high adaptability across clinical applications. Conversely, DINOv2, a general-purpose vision FM pre-trained on 142 million natural images, has shown promise in non-medical domains. However, its applicability to clinical tasks remains underexplored. 
To address this, we conducted head-to-head evaluations by fine-tuning RETFound and three DINOv2 models (large, base, small) for ocular disease detection and systemic disease prediction tasks, across eight standardized open-source ocular datasets, as well as the Moorfields AlzEye and the UK Biobank datasets. DINOv2-large model outperformed RETFound in detecting diabetic retinopathy (AUROC=0.850-0.952 vs 0.823-0.944, across three datasets, all P&lt;=0.007) and multi-class eye diseases (AUROC=0.892 vs. 0.846, P&lt;0.001). In glaucoma, DINOv2-base model outperformed RETFound (AUROC=0.958 vs 0.940, P&lt;0.001). Conversely, RETFound achieved superior performance over all DINOv2 models in predicting heart failure, myocardial infarction, and ischaemic stroke (AUROC=0.732-0.796 vs 0.663-0.771, all P&lt;0.001). These trends persisted even with 10% of the fine-tuning data. These findings showcase the distinct scenarios where general-purpose and domain-specific FMs excel, highlighting the importance of aligning FM selection with task-specific requirements to optimise clinical performance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06289v1-abstract-full').style.display = 'none'; document.getElementById('2502.06289v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.05845">arXiv:2502.05845</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.05845">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Exploiting the Hidden Capacity of MMC Through Accurate Quantification of Modulation Indices </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Sun%2C+Q">Qianhao Sun</a>, <a href="/search/eess?searchtype=author&amp;query=Meng%2C+J">Jingwei Meng</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+R">Ruofan Li</a>, <a href="/search/eess?searchtype=author&amp;query=Xia%2C+M">Mingchao Xia</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+Q">Qifang Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Zhou%2C+J">Jiejie Zhou</a>, <a href="/search/eess?searchtype=author&amp;query=Fan%2C+M">Meiqi Fan</a>, <a href="/search/eess?searchtype=author&amp;query=Guo%2C+P">Peiqian Guo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.05845v1-abstract-short" style="display: inline;"> The modular multilevel converter (MMC) has become increasingly important in voltage-source converter-based high-voltage direct current (VSC-HVDC) systems. Direct and indirect modulation are widely used as mainstream modulation techniques in MMCs. 
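As a rough picture of the head-to-head setup described above, the sketch below fine-tunes a general-purpose DINOv2 backbone with a task-specific classification head. It is a minimal sketch under stated assumptions, not the study's code: the torch.hub entry point is the public DINOv2 release, the 768-dim embedding width is that of ViT-B/14, and the optimizer, learning rate, and 5-class head are placeholders; RETFound would be plugged in analogously with its own weights.

```python
import torch
import torch.nn as nn

# Load a general-purpose DINOv2 backbone from torch.hub (internet access assumed).
backbone = torch.hub.load("facebookresearch/dinov2", "dinov2_vitb14")
embed_dim = 768  # CLS-token width of ViT-B/14

# Attach a small classification head, e.g. a 5-grade diabetic retinopathy task.
head = nn.Linear(embed_dim, 5)
optimizer = torch.optim.AdamW(
    list(backbone.parameters()) + list(head.parameters()), lr=1e-5
)
criterion = nn.CrossEntropyLoss()

def training_step(images, labels):
    """One full fine-tuning step: backbone and head are both updated."""
    feats = backbone(images)            # (B, embed_dim) CLS embeddings
    loss = criterion(head(feats), labels)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()

# Dummy batch only to show expected shapes (side lengths must be multiples of 14).
images = torch.randn(4, 3, 224, 224)
labels = torch.randint(0, 5, (4,))
print(training_step(images, labels))
```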
3. arXiv:2502.05845 [pdf] | eess.SY (Systems and Control)
Exploiting the Hidden Capacity of MMC Through Accurate Quantification of Modulation Indices
Authors: Qianhao Sun, Jingwei Meng, Ruofan Li, Mingchao Xia, Qifang Chen, Jiejie Zhou, Meiqi Fan, Peiqian Guo
Abstract: The modular multilevel converter (MMC) has become increasingly important in voltage-source converter-based high-voltage direct current (VSC-HVDC) systems. Direct and indirect modulation are widely used as mainstream modulation techniques in MMCs. However, because the operation of different modulation schemes is difficult to evaluate quantitatively, the academic and industrial communities still hold differing opinions on their performance. To address this controversy, this paper employs state-of-the-art computational methods and quantitative metrics to compare the performance of different modulation schemes. The findings indicate that direct modulation offers superior modulation potential for MMCs, highlighting its higher ac voltage output capability and broader linear PQ operation region. Conversely, indirect modulation is disadvantaged in linear modulation, which indicates inferior output voltage capability. Furthermore, this paper delves into the conditions under which direct and indirect modulation techniques become equivalent in steady state. The findings suggest that the modulation capability of direct modulation is the same as that of indirect modulation in steady state when additional controls, including closed-loop capacitor voltage control and circulating current suppression control (CCSC), are simultaneously active. Simulations and experiments verify the correctness and validity of the analysis.
Submitted 9 February 2025; originally announced February 2025.
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.11028">arXiv:2501.11028</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.11028">pdf</a>, <a href="https://arxiv.org/format/2501.11028">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Few-shot Human Motion Recognition through Multi-Aspect mmWave FMCW Radar Data </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Fan%2C+H">Hao Fan</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+L">Lingfeng Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Xu%2C+C">Chengbai Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Zhou%2C+J">Jiadong Zhou</a>, <a href="/search/eess?searchtype=author&amp;query=Dai%2C+Y">Yongpeng Dai</a>, <a href="/search/eess?searchtype=author&amp;query=HU%2C+P">Panhe HU</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.11028v1-abstract-short" style="display: inline;"> Radar human motion recognition methods based on deep learning models has been a heated spot of remote sensing in recent years, yet the existing methods are mostly radial-oriented. In practical application, the test data could be multi-aspect and the sample number of each motion could be very limited, causing model overfitting and reduced recognition accuracy. This paper proposed channel-DN4, a mul&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.11028v1-abstract-full').style.display = 'inline'; document.getElementById('2501.11028v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.11028v1-abstract-full" style="display: none;"> Radar human motion recognition methods based on deep learning models has been a heated spot of remote sensing in recent years, yet the existing methods are mostly radial-oriented. In practical application, the test data could be multi-aspect and the sample number of each motion could be very limited, causing model overfitting and reduced recognition accuracy. This paper proposed channel-DN4, a multi-aspect few-shot human motion recognition method. First, local descriptors are introduced for a precise classification metric. Moreover, episodic training strategy was adopted to reduce model overfitting. To utilize the invariant sematic information in multi-aspect conditions, we considered channel attention after the embedding network to obtain precise implicit high-dimensional representation of sematic information. We tested the performance of channel-DN4 and methods for comparison on measured mmWave FMCW radar data. The proposed channel-DN4 produced competitive and convincing results, reaching the highest 87.533% recognition accuracy in 3-way 10-shot condition while other methods suffer from overfitting. 
Codes are available at: https://github.com/MountainChenCad/channel-DN4 <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.11028v1-abstract-full').style.display = 'none'; document.getElementById('2501.11028v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.10811">arXiv:2501.10811</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.10811">pdf</a>, <a href="https://arxiv.org/format/2501.10811">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> MusicEval: A Generative Music Corpus with Expert Ratings for Automatic Text-to-Music Evaluation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Liu%2C+C">Cheng Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+H">Hui Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Zhao%2C+J">Jinghua Zhao</a>, <a href="/search/eess?searchtype=author&amp;query=Zhao%2C+S">Shiwan Zhao</a>, <a href="/search/eess?searchtype=author&amp;query=Bu%2C+H">Hui Bu</a>, <a href="/search/eess?searchtype=author&amp;query=Xu%2C+X">Xin Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Zhou%2C+J">Jiaming Zhou</a>, <a href="/search/eess?searchtype=author&amp;query=Sun%2C+H">Haoqin Sun</a>, <a href="/search/eess?searchtype=author&amp;query=Qin%2C+Y">Yong Qin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.10811v1-abstract-short" style="display: inline;"> The technology for generating music from textual descriptions has seen rapid advancements. However, evaluating text-to-music (TTM) systems remains a significant challenge, primarily due to the difficulty of balancing performance and cost with existing objective and subjective evaluation methods. In this paper, we propose an automatic assessment task for TTM models to align with human perception. T&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.10811v1-abstract-full').style.display = 'inline'; document.getElementById('2501.10811v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.10811v1-abstract-full" style="display: none;"> The technology for generating music from textual descriptions has seen rapid advancements. However, evaluating text-to-music (TTM) systems remains a significant challenge, primarily due to the difficulty of balancing performance and cost with existing objective and subjective evaluation methods. In this paper, we propose an automatic assessment task for TTM models to align with human perception. 
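To make the local-descriptor metric mentioned above concrete, here is a small sketch of a DN4-style image-to-class measure: each local descriptor of a query sample is compared (by cosine similarity) against the pooled local descriptors of a candidate class, and the top-k similarities are summed. This is a generic illustration, not the released channel-DN4 code; descriptor counts, dimensions, and k are assumptions.

```python
import torch
import torch.nn.functional as F

def image_to_class_score(query_desc, class_desc, k=3):
    """DN4-style image-to-class similarity.

    query_desc: (Nq, C) local descriptors of one query sample
    class_desc: (Ns, C) pooled local descriptors of one support class
    Returns a scalar score: higher means the query is closer to the class.
    """
    q = F.normalize(query_desc, dim=-1)
    s = F.normalize(class_desc, dim=-1)
    sim = q @ s.t()                      # (Nq, Ns) cosine similarities
    topk = sim.topk(k, dim=-1).values    # k nearest support descriptors per query descriptor
    return topk.sum()

# Toy example: 64 local descriptors of dim 128 per sample, 5 support samples per class.
query = torch.randn(64, 128)
support_class = torch.randn(5 * 64, 128)
print(image_to_class_score(query, support_class))
```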
5. arXiv:2501.10811 [pdf, other] | cs.SD (Sound); eess.AS (Audio and Speech Processing)
MusicEval: A Generative Music Corpus with Expert Ratings for Automatic Text-to-Music Evaluation
Authors: Cheng Liu, Hui Wang, Jinghua Zhao, Shiwan Zhao, Hui Bu, Xin Xu, Jiaming Zhou, Haoqin Sun, Yong Qin
Abstract: The technology for generating music from textual descriptions has seen rapid advancements. However, evaluating text-to-music (TTM) systems remains a significant challenge, primarily due to the difficulty of balancing performance and cost with existing objective and subjective evaluation methods. In this paper, we propose an automatic assessment task for TTM models to align with human perception. To address the TTM evaluation challenges posed by the professional requirements of music evaluation and the complexity of the relationship between text and music, we collect MusicEval, the first generative music assessment dataset. This dataset contains 2,748 music clips generated by 31 advanced and widely used models in response to 384 text prompts, along with 13,740 ratings from 14 music experts. Furthermore, we design a CLAP-based assessment model built on this dataset, and our experimental results validate the feasibility of the proposed task, providing a valuable reference for future development in TTM evaluation. The dataset is available at https://www.aishelltech.com/AISHELL_7A.
Submitted 18 January 2025; originally announced January 2025.
Comments: Accepted by ICASSP 2025

6. arXiv:2501.06282 [pdf, other] | cs.CL (Computation and Language); cs.AI (Artificial Intelligence); cs.HC (Human-Computer Interaction); cs.SD (Sound); eess.AS (Audio and Speech Processing)
MinMo: A Multimodal Large Language Model for Seamless Voice Interaction
Authors: Qian Chen, Yafeng Chen, Yanni Chen, Mengzhe Chen, Yingda Chen, Chong Deng, Zhihao Du, Ruize Gao, Changfeng Gao, Zhifu Gao, Yabin Li, Xiang Lv, Jiaqing Liu, Haoneng Luo, Bin Ma, Chongjia Ni, Xian Shi, Jialong Tang, Hui Wang, Hao Wang, Wen Wang, Yuxuan Wang, Yunlan Xu, Fan Yu, Zhijie Yan, et al. (11 additional authors not shown)
Abstract: Recent advancements in large language models (LLMs) and multimodal speech-text models have laid the groundwork for seamless voice interactions, enabling real-time, natural, and human-like conversations. Previous models for voice interaction are categorized as native and aligned. Native models integrate speech and text processing in one framework but struggle with issues like differing sequence lengths and insufficient pre-training. Aligned models maintain text LLM capabilities but are often limited by small datasets and a narrow focus on speech tasks. In this work, we introduce MinMo, a Multimodal Large Language Model with approximately 8B parameters for seamless voice interaction. We address the main limitations of prior aligned multimodal models. We train MinMo through multiple stages of speech-to-text alignment, text-to-speech alignment, speech-to-speech alignment, and duplex interaction alignment, on 1.4 million hours of diverse speech data and a broad range of speech tasks. After the multi-stage training, MinMo achieves state-of-the-art performance across various benchmarks for voice comprehension and generation while maintaining the capabilities of text LLMs, and it also facilitates full-duplex conversation, that is, simultaneous two-way communication between the user and the system. Moreover, we propose a novel and simple voice decoder that outperforms prior models in voice generation. The enhanced instruction-following capabilities of MinMo support controlling speech generation based on user instructions, with various nuances including emotions, dialects, and speaking rates, and mimicking specific voices. For MinMo, the speech-to-text latency is approximately 100 ms, and the full-duplex latency is approximately 600 ms in theory and 800 ms in practice. The MinMo project web page is https://funaudiollm.github.io/minmo, and the code and models will be released soon.
Submitted 10 January 2025; originally announced January 2025.
Comments: Work in progress. Authors are listed in alphabetical order by family name

7. arXiv:2501.02815 [pdf, other] | cs.RO (Robotics); eess.SY (Systems and Control)
Local Reactive Control for Mobile Manipulators with Whole-Body Safety in Complex Environments
Authors: Chunxin Zheng, Yulin Li, Zhiyuan Song, Zhihai Bi, Jinni Zhou, Boyu Zhou, Jun Ma
Abstract: Mobile manipulators typically encounter significant challenges in navigating narrow, cluttered environments due to their high-dimensional state spaces and complex kinematics. While reactive methods excel in dynamic settings, they struggle to efficiently incorporate complex, coupled constraints across the entire state space. In this work, we present a novel local reactive controller that reformulates the time-domain single-step problem into a multi-step optimization problem in the spatial domain, leveraging the propagation of a serial kinematic chain. This transformation facilitates the formulation of customized, decoupled link-specific constraints, which are then solved efficiently with augmented Lagrangian differential dynamic programming (AL-DDP). Our approach naturally absorbs spatial kinematic propagation in the forward pass and processes all link-specific constraints simultaneously during the backward pass, enhancing both constraint management and computational efficiency. Notably, in this framework we formulate collision avoidance constraints for each link using accurate geometric models with extracted free regions, which improves the maneuverability of the mobile manipulator in narrow, cluttered spaces. Experimental results showcase significant improvements in safety, efficiency, and task completion rates. These findings underscore the robustness of the proposed method, particularly in narrow, cluttered environments where conventional approaches could falter. The open-source project can be found at https://github.com/Chunx1nZHENG/MM-with-Whole-Body-Safety-Release.git.
Submitted 6 January 2025; originally announced January 2025.

8. arXiv:2412.20821 [pdf, other] | eess.AS (Audio and Speech Processing); cs.CL (Computation and Language); cs.SD (Sound)
Enhancing Multimodal Emotion Recognition through Multi-Granularity Cross-Modal Alignment
Authors: Xuechen Wang, Shiwan Zhao, Haoqin Sun, Hui Wang, Jiaming Zhou, Yong Qin
Abstract: Multimodal emotion recognition (MER), leveraging speech and text, has emerged as a pivotal domain within human-computer interaction, demanding sophisticated methods for effective multimodal integration. The challenge of aligning features across these modalities is significant, with most existing approaches adopting a singular alignment strategy. Such a narrow focus not only limits model performance but also fails to address the complexity and ambiguity inherent in emotional expressions. In response, this paper introduces a Multi-Granularity Cross-Modal Alignment (MGCMA) framework, distinguished by its comprehensive approach encompassing distribution-based, instance-based, and token-based alignment modules. This framework enables a multi-level perception of emotional information across modalities. Our experiments on IEMOCAP demonstrate that our proposed method outperforms current state-of-the-art techniques.
Submitted 30 December 2024; originally announced December 2024.
Comments: ICASSP 2025 - 2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)
9. arXiv:2412.19099 [pdf, other] | cs.SD (Sound); eess.AS (Audio and Speech Processing)
BSDB-Net: Band-Split Dual-Branch Network with Selective State Spaces Mechanism for Monaural Speech Enhancement
Authors: Cunhang Fan, Enrui Liu, Andong Li, Jianhua Tao, Jian Zhou, Jiahao Li, Chengshi Zheng, Zhao Lv
Abstract: Although complex spectrum-based speech enhancement (SE) methods have achieved significant performance, coupling amplitude and phase can lead to a compensation effect, where amplitude information is sacrificed to compensate for the phase, which is harmful to SE. In addition, to further improve the performance of SE, many modules are stacked onto SE models, resulting in increased model complexity that limits the application of SE. To address these problems, we propose a dual-path network based on compressed frequency using Mamba. First, we extract amplitude and phase information through parallel dual branches. This approach leverages structured complex spectra to implicitly capture phase information and resolves the compensation effect by decoupling amplitude and phase, and the network incorporates an interaction module to suppress unnecessary parts and recover missing components from the other branch. Second, to reduce network complexity, the network introduces a band-split strategy to compress the frequency dimension. To further reduce complexity while maintaining good performance, we design a Mamba-based module that models the time and frequency dimensions under linear complexity. Finally, compared to baselines, our model achieves an average 8.3-times reduction in computational complexity while maintaining superior performance. Furthermore, it achieves a 25-times reduction in complexity compared to transformer-based models.
Submitted 26 December 2024; originally announced December 2024.
Comments: Accepted by AAAI 2025
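The band-split strategy mentioned in the abstract above can be sketched as follows: the frequency axis of a complex spectrogram is partitioned into a few bands, and each band is projected to a fixed-width embedding, compressing the frequency dimension before any sequence model. This is a generic, hypothetical illustration of band splitting, not BSDB-Net's actual module; the band boundaries and embedding size are assumptions.

```python
import torch
import torch.nn as nn

class BandSplit(nn.Module):
    """Split F frequency bins into bands and embed each band into emb_dim features."""
    def __init__(self, band_edges=(0, 32, 64, 128, 257), emb_dim=64):
        super().__init__()
        self.band_edges = band_edges
        # One linear projection per band; input is real+imag, hence the factor 2.
        self.proj = nn.ModuleList([
            nn.Linear(2 * (hi - lo), emb_dim)
            for lo, hi in zip(band_edges[:-1], band_edges[1:])
        ])

    def forward(self, spec):
        # spec: (B, T, F) complex STFT frames.
        outs = []
        for (lo, hi), proj in zip(zip(self.band_edges[:-1], self.band_edges[1:]), self.proj):
            band = spec[..., lo:hi]                            # (B, T, hi-lo), complex
            feat = torch.cat([band.real, band.imag], dim=-1)   # (B, T, 2*(hi-lo)), real
            outs.append(proj(feat))
        return torch.stack(outs, dim=2)                        # (B, T, n_bands, emb_dim)

spec = torch.randn(1, 100, 257, dtype=torch.cfloat)  # 100 frames, 257 bins (n_fft=512)
print(BandSplit()(spec).shape)                       # torch.Size([1, 100, 4, 64])
```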
10. arXiv:2412.18417 [pdf, other] | eess.IV (Image and Video Processing); cs.CV (Computer Vision and Pattern Recognition)
Ultra-Low Complexity On-Orbit Compression for Remote Sensing Imagery via Block Modulated Imaging
Authors: Zhibin Wang, Yanxin Cai, Jiayi Zhou, Yangming Zhang, Tianyu Li, Wei Li, Xun Liu, Guoqing Wang, Yang Yang
Abstract: The growing field of remote sensing faces a challenge: the ever-increasing size and volume of imagery data are exceeding the storage and transmission capabilities of satellite platforms. Efficient compression of remote sensing imagery is a critical solution to alleviate these burdens on satellites. However, existing compression methods are often too computationally expensive for satellites. With the continued advancement of compressed sensing theory, single-pixel imaging emerges as a powerful tool that brings new possibilities for on-orbit image compression. However, it still suffers from prolonged imaging times and the inability to perform high-resolution imaging, hindering its practical application. This paper advances the study of compressed sensing in remote sensing image compression, proposing Block Modulated Imaging (BMI). By requiring only a single exposure, BMI significantly enhances imaging acquisition speeds. Additionally, BMI obviates the need for digital micromirror devices and surpasses limitations in image resolution. Furthermore, we propose a novel decoding network specifically designed to reconstruct images compressed under the BMI framework. Leveraging gated 3D convolutions and promoting efficient information flow across stages through a Two-Way Cross-Attention module, our decoding network exhibits demonstrably superior reconstruction performance. Extensive experiments conducted on multiple renowned remote sensing datasets unequivocally demonstrate the efficacy of our proposed method. To further validate its practical applicability, we developed and tested a prototype of the BMI-based camera, which has shown promising potential for on-orbit image compression. The code is available at https://github.com/Johnathan218/BMNet.
Submitted 24 December 2024; originally announced December 2024.
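As a rough picture of what "block modulated" compression means here, the sketch below encodes an image in a single shot by multiplying each non-overlapping block with its own random modulation mask and summing the modulated blocks into one block-sized measurement. This is a hypothetical toy model of the sensing step only (no optics and no reconstruction network); the +/-1 mask statistics and block size are assumptions.

```python
import numpy as np

def block_modulated_measure(img, block=64, seed=0):
    """Compress an (H, W) image into a single (block, block) measurement.

    Each non-overlapping block x_i is multiplied elementwise by a random
    +/-1 mask m_i, and the modulated blocks are summed: y = sum_i m_i * x_i.
    The compression ratio is (H*W) / (block*block).
    """
    rng = np.random.default_rng(seed)
    H, W = img.shape
    assert H % block == 0 and W % block == 0, "image must tile into blocks"
    y = np.zeros((block, block), dtype=np.float64)
    masks = []
    for i in range(0, H, block):
        for j in range(0, W, block):
            m = rng.choice([-1.0, 1.0], size=(block, block))
            y += m * img[i:i + block, j:j + block]
            masks.append(m)
    return y, masks  # the masks are needed later by the decoder

img = np.random.rand(256, 256)
y, masks = block_modulated_measure(img)
print(y.shape, len(masks))  # (64, 64) 16 -> 16x fewer samples than the image
```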
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.17062">arXiv:2412.17062</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.17062">pdf</a>, <a href="https://arxiv.org/ps/2412.17062">ps</a>, <a href="https://arxiv.org/format/2412.17062">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Hybrid Beamforming Design for RSMA-enabled Near-Field Integrated Sensing and Communications </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Zhou%2C+J">Jiasi Zhou</a>, <a href="/search/eess?searchtype=author&amp;query=Zhou%2C+C">Cong Zhou</a>, <a href="/search/eess?searchtype=author&amp;query=Tellambura%2C+C">Chintha Tellambura</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+G+Y">Geoffrey Ye Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.17062v1-abstract-short" style="display: inline;"> To enable high data rates and sensing resolutions, integrated sensing and communication (ISAC) networks leverage extremely large antenna arrays and high frequencies, extending the Rayleigh distance and making near-field (NF) spherical wave propagation dominant. This unlocks numerous spatial degrees of freedom, raising the challenge of optimizing them for communication and sensing tradeoffs. To thi&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.17062v1-abstract-full').style.display = 'inline'; document.getElementById('2412.17062v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.17062v1-abstract-full" style="display: none;"> To enable high data rates and sensing resolutions, integrated sensing and communication (ISAC) networks leverage extremely large antenna arrays and high frequencies, extending the Rayleigh distance and making near-field (NF) spherical wave propagation dominant. This unlocks numerous spatial degrees of freedom, raising the challenge of optimizing them for communication and sensing tradeoffs. To this end, we propose a rate-splitting multiple access (RSMA)-based NF-ISAC transmit scheme utilizing hybrid digital-analog antennas. RSMA enhances interference management, while a variable number of dedicated sensing beams adds beamforming flexibility. The objective is to maximize the minimum communication rate while ensuring multi-target sensing performance by jointly optimizing receive filters, analog and digital beamformers, common rate allocation, and the sensing beam count. To address uncertainty in sensing beam allocation, a rank-zero solution reconstruction method demonstrates that dedicated sensing beams are unnecessary for NF multi-target detection. A penalty dual decomposition (PDD)-based double-loop algorithm is introduced, employing weighted minimum mean-squared error (WMMSE) and quadratic transforms to reformulate communication and sensing rates. 
Simulations reveal that the proposed scheme: 1) Achieves performance comparable to fully digital beamforming with fewer RF chains, (2) Maintains NF multi-target detection without compromising communication rates, and 3) Significantly outperforms space division multiple access (SDMA) and far-field ISAC systems. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.17062v1-abstract-full').style.display = 'none'; document.getElementById('2412.17062v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">13 pages and 7 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.10117">arXiv:2412.10117</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.10117">pdf</a>, <a href="https://arxiv.org/format/2412.10117">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> CosyVoice 2: Scalable Streaming Speech Synthesis with Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Du%2C+Z">Zhihao Du</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+Y">Yuxuan Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+Q">Qian Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Shi%2C+X">Xian Shi</a>, <a href="/search/eess?searchtype=author&amp;query=Lv%2C+X">Xiang Lv</a>, <a href="/search/eess?searchtype=author&amp;query=Zhao%2C+T">Tianyu Zhao</a>, <a href="/search/eess?searchtype=author&amp;query=Gao%2C+Z">Zhifu Gao</a>, <a href="/search/eess?searchtype=author&amp;query=Yang%2C+Y">Yexin Yang</a>, <a href="/search/eess?searchtype=author&amp;query=Gao%2C+C">Changfeng Gao</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+H">Hui Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Yu%2C+F">Fan Yu</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+H">Huadai Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Sheng%2C+Z">Zhengyan Sheng</a>, <a href="/search/eess?searchtype=author&amp;query=Gu%2C+Y">Yue Gu</a>, <a href="/search/eess?searchtype=author&amp;query=Deng%2C+C">Chong Deng</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+W">Wen Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+S">Shiliang Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Yan%2C+Z">Zhijie Yan</a>, <a href="/search/eess?searchtype=author&amp;query=Zhou%2C+J">Jingren Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.10117v3-abstract-short" 
style="display: inline;"> In our previous work, we introduced CosyVoice, a multilingual speech synthesis model based on supervised discrete speech tokens. By employing progressive semantic decoding with two popular generative models, language models (LMs) and Flow Matching, CosyVoice demonstrated high prosody naturalness, content consistency, and speaker similarity in speech in-context learning. Recently, significant progr&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.10117v3-abstract-full').style.display = 'inline'; document.getElementById('2412.10117v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.10117v3-abstract-full" style="display: none;"> In our previous work, we introduced CosyVoice, a multilingual speech synthesis model based on supervised discrete speech tokens. By employing progressive semantic decoding with two popular generative models, language models (LMs) and Flow Matching, CosyVoice demonstrated high prosody naturalness, content consistency, and speaker similarity in speech in-context learning. Recently, significant progress has been made in multi-modal large language models (LLMs), where the response latency and real-time factor of speech synthesis play a crucial role in the interactive experience. Therefore, in this report, we present an improved streaming speech synthesis model, CosyVoice 2, which incorporates comprehensive and systematic optimizations. Specifically, we introduce finite-scalar quantization to improve the codebook utilization of speech tokens. For the text-speech LM, we streamline the model architecture to allow direct use of a pre-trained LLM as the backbone. In addition, we develop a chunk-aware causal flow matching model to support various synthesis scenarios, enabling both streaming and non-streaming synthesis within a single model. By training on a large-scale multilingual dataset, CosyVoice 2 achieves human-parity naturalness, minimal response latency, and virtually lossless synthesis quality in the streaming mode. We invite readers to listen to the demos at https://funaudiollm.github.io/cosyvoice2. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.10117v3-abstract-full').style.display = 'none'; document.getElementById('2412.10117v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 13 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Tech report, work in progress</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.08210">arXiv:2412.08210</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.08210">pdf</a>, <a href="https://arxiv.org/format/2412.08210">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Unicorn: Unified Neural Image Compression with One Number Reconstruction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Zheng%2C+Q">Qi Zheng</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+H">Haozhi Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+Z">Zihao Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+J">Jiaming Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+P">Peiye Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Hao%2C+Z">Zhijian Hao</a>, <a href="/search/eess?searchtype=author&amp;query=Lu%2C+Y">Yanheng Lu</a>, <a href="/search/eess?searchtype=author&amp;query=Niu%2C+D">Dimin Niu</a>, <a href="/search/eess?searchtype=author&amp;query=Zhou%2C+J">Jinjia Zhou</a>, <a href="/search/eess?searchtype=author&amp;query=Jing%2C+M">Minge Jing</a>, <a href="/search/eess?searchtype=author&amp;query=Fan%2C+Y">Yibo Fan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.08210v1-abstract-short" style="display: inline;"> Prevalent lossy image compression schemes can be divided into: 1) explicit image compression (EIC), including traditional standards and neural end-to-end algorithms; 2) implicit image compression (IIC) based on implicit neural representations (INR). The former is encountering impasses of either leveling off bitrate reduction at a cost of tremendous complexity while the latter suffers from excessiv&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.08210v1-abstract-full').style.display = 'inline'; document.getElementById('2412.08210v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.08210v1-abstract-full" style="display: none;"> Prevalent lossy image compression schemes can be divided into: 1) explicit image compression (EIC), including traditional standards and neural end-to-end algorithms; 2) implicit image compression (IIC) based on implicit neural representations (INR). The former is encountering impasses of either leveling off bitrate reduction at a cost of tremendous complexity while the latter suffers from excessive smoothing quality as well as lengthy decoder models. In this paper, we propose an innovative paradigm, which we dub \textbf{Unicorn} (\textbf{U}nified \textbf{N}eural \textbf{I}mage \textbf{C}ompression with \textbf{O}ne \textbf{N}number \textbf{R}econstruction). 
By conceptualizing the images as index-image pairs and learning the inherent distribution of pairs in a subtle neural network model, Unicorn can reconstruct a visually pleasing image from randomly generated noise with only one index number. The neural model serves as the unified decoder of images while the noises and indexes correspond to explicit representations. As a proof of concept, we propose an effective and efficient prototype of Unicorn based on latent diffusion models with tailored model designs. Quantitative and qualitative experimental results demonstrate that our prototype achieves significant bitrate reduction compared with EIC and IIC algorithms. More impressively, benefitting from the unified decoder, our compression ratio escalates as the quantity of images increases. We envision that more advanced model designs will endow Unicorn with greater potential in image compression. We will release our code at \url{https://github.com/uniqzheng/Unicorn-Laduree}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.08210v1-abstract-full').style.display = 'none'; document.getElementById('2412.08210v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.05174">arXiv:2412.05174</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.05174">pdf</a>, <a href="https://arxiv.org/format/2412.05174">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Methodology">stat.ME</span> </div> </div> <p class="title is-5 mathjax"> Compound Gaussian Radar Clutter Model With Positive Tempered Alpha-Stable Texture </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Liao%2C+X">Xingxing Liao</a>, <a href="/search/eess?searchtype=author&amp;query=Xie%2C+J">Junhao Xie</a>, <a href="/search/eess?searchtype=author&amp;query=Zhou%2C+J">Jie Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.05174v1-abstract-short" style="display: inline;"> The compound Gaussian (CG) family of distributions has achieved great success in modeling sea clutter. This work develops a flexible-tailed CG model to improve generality in clutter modeling, by introducing the positive tempered $α$-stable (PT$α$S) distribution to model clutter texture. The PT$α$S distribution exhibits widely tunable tails by tempering the heavy tails of the positive $α$-stable (P&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.05174v1-abstract-full').style.display = 'inline'; document.getElementById('2412.05174v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.05174v1-abstract-full" style="display: none;"> The compound Gaussian (CG) family of distributions has achieved great success in modeling sea clutter. 
This work develops a flexible-tailed CG model to improve generality in clutter modeling, by introducing the positive tempered $α$-stable (PT$α$S) distribution to model clutter texture. The PT$α$S distribution exhibits widely tunable tails by tempering the heavy tails of the positive $α$-stable (P$α$S) distribution, thus providing greater flexibility in texture modeling. Specifically, we first develop a bivariate isotropic CG-PT$α$S complex clutter model that is defined by an explicit characteristic function, based on which the corresponding amplitude model is derived. Then, we prove that the amplitude model can be expressed as a scale mixture of Rayleighs, just as the successful compound K and Pareto models. Furthermore, a characteristic function-based method is developed to estimate the parameters of the amplitude model. Finally, real-world sea clutter data analysis indicates the amplitude model&#39;s flexibility in modeling clutter data with various tail behaviors. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.05174v1-abstract-full').style.display = 'none'; document.getElementById('2412.05174v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">7 pages, 4 figures, 2 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.01425">arXiv:2412.01425</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.01425">pdf</a>, <a href="https://arxiv.org/format/2412.01425">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Reject Threshold Adaptation for Open-Set Model Attribution of Deepfake Audio </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Yan%2C+X">Xinrui Yan</a>, <a href="/search/eess?searchtype=author&amp;query=Yi%2C+J">Jiangyan Yi</a>, <a href="/search/eess?searchtype=author&amp;query=Tao%2C+J">Jianhua Tao</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+Y">Yujie Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Gu%2C+H">Hao Gu</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+G">Guanjun Li</a>, <a href="/search/eess?searchtype=author&amp;query=Zhou%2C+J">Junzuo Zhou</a>, <a href="/search/eess?searchtype=author&amp;query=Ren%2C+Y">Yong Ren</a>, <a href="/search/eess?searchtype=author&amp;query=Xu%2C+T">Tao Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.01425v1-abstract-short" style="display: inline;"> Open environment oriented open set model attribution of deepfake audio is an emerging research topic, aiming to identify the generation models of deepfake audio. 
Most previous work requires manually setting a rejection threshold for unknown classes to compare with predicted probabilities. However, models often overfit training instances and generate overly confident predictions. Moreover, threshol&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.01425v1-abstract-full').style.display = 'inline'; document.getElementById('2412.01425v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.01425v1-abstract-full" style="display: none;"> Open environment oriented open set model attribution of deepfake audio is an emerging research topic, aiming to identify the generation models of deepfake audio. Most previous work requires manually setting a rejection threshold for unknown classes to compare with predicted probabilities. However, models often overfit training instances and generate overly confident predictions. Moreover, thresholds that effectively distinguish unknown categories in the current dataset may not be suitable for identifying known and unknown categories in another data distribution. To address these issues, we propose a novel framework for open set model attribution of deepfake audio with rejection threshold adaptation (ReTA). Specifically, the reconstruction error learning module trains by combining the representation of system fingerprints with labels corresponding to either the target class or a randomly chosen other class label. This process generates matching and non-matching reconstructed samples, establishing the reconstruction error distributions for each class and laying the foundation for the reject threshold calculation module. The reject threshold calculation module utilizes Gaussian probability estimation to fit the distributions of matching and non-matching reconstruction errors. It then computes adaptive reject thresholds for all classes through probability minimization criteria. The experimental results demonstrate the effectiveness of ReTA in improving open set model attribution of deepfake audio. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.01425v1-abstract-full').style.display = 'none'; document.getElementById('2412.01425v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ISCSLP 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.00632">arXiv:2412.00632</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.00632">pdf</a>, <a href="https://arxiv.org/ps/2412.00632">ps</a>, <a href="https://arxiv.org/format/2412.00632">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Flexible Rate-Splitting Multiple Access for Near-Field Integrated Sensing and Communications </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Zhou%2C+J">Jiasi Zhou</a>, <a href="/search/eess?searchtype=author&amp;query=Zhou%2C+C">Cong Zhou</a>, <a href="/search/eess?searchtype=author&amp;query=Zeng%2C+C">Cheng Zeng</a>, <a href="/search/eess?searchtype=author&amp;query=Tellambura%2C+C">Chintha Tellambura</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.00632v1-abstract-short" style="display: inline;"> This letter presents a flexible rate-splitting multiple access (RSMA) framework for near-field (NF) integrated sensing and communications (ISAC). The spatial beams configured to meet the communication rate requirements of NF users are simultaneously leveraged to sense an additional NF target. A key innovation lies in its flexibility to select a subset of users for decoding the common stream, enhan&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.00632v1-abstract-full').style.display = 'inline'; document.getElementById('2412.00632v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.00632v1-abstract-full" style="display: none;"> This letter presents a flexible rate-splitting multiple access (RSMA) framework for near-field (NF) integrated sensing and communications (ISAC). The spatial beams configured to meet the communication rate requirements of NF users are simultaneously leveraged to sense an additional NF target. A key innovation lies in its flexibility to select a subset of users for decoding the common stream, enhancing interference management and system performance. The system is designed by minimizing the Cram茅r-Rao bound (CRB) for joint distance and angle estimation through optimized power allocation, common rate allocation, and user selection. This leads to a discrete, non-convex optimization problem. Remarkably, we demonstrate that the preconfigured beams are sufficient for target sensing, eliminating the need for additional probing signals. To solve the optimization problem, an iterative algorithm is proposed combining the quadratic transform and simulated annealing. Simulation results indicate that the proposed scheme significantly outperforms conventional RSMA and space division multiple access (SDMA), reducing distance and angle estimation errors by approximately 100\% and 20\%, respectively. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.00632v1-abstract-full').style.display = 'none'; document.getElementById('2412.00632v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages and 5 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.00487">arXiv:2412.00487</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.00487">pdf</a>, <a href="https://arxiv.org/ps/2412.00487">ps</a>, <a href="https://arxiv.org/format/2412.00487">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Joint Beam Scheduling and Resource Allocation for Flexible RSMA-aided Near-Field Communications </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Zhou%2C+J">Jiasi Zhou</a>, <a href="/search/eess?searchtype=author&amp;query=Zhou%2C+C">Cong Zhou</a>, <a href="/search/eess?searchtype=author&amp;query=Mao%2C+Y">Yijie Mao</a>, <a href="/search/eess?searchtype=author&amp;query=Tellambura%2C+C">Chintha Tellambura</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.00487v1-abstract-short" style="display: inline;"> Supporting immense throughput and ubiquitous connectivity holds paramount importance for future wireless networks. To this end, this letter focuses on how the spatial beams configured for legacy near-field (NF) users can be leveraged to serve extra NF or far-field users while ensuring the rate requirements of legacy NF users. In particular, a flexible rate splitting multiple access (RSMA) scheme i&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.00487v1-abstract-full').style.display = 'inline'; document.getElementById('2412.00487v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.00487v1-abstract-full" style="display: none;"> Supporting immense throughput and ubiquitous connectivity holds paramount importance for future wireless networks. To this end, this letter focuses on how the spatial beams configured for legacy near-field (NF) users can be leveraged to serve extra NF or far-field users while ensuring the rate requirements of legacy NF users. In particular, a flexible rate splitting multiple access (RSMA) scheme is proposed to efficiently manage interference, which carefully selects a subset of legacy users to decode the common stream. Beam scheduling, power allocation, common rate allocation, and user selection are jointly optimized to maximize the sum rate of additional users. To solve the formulated discrete non-convex problem, it is split into three subproblems. 
The accelerated bisection searching, quadratic transform, and simulated annealing approaches are developed to attack them. Simulation results reveal that the proposed transmit scheme and algorithm achieve significant gains over three competing benchmarks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.00487v1-abstract-full').style.display = 'none'; document.getElementById('2412.00487v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages and 4 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.00302">arXiv:2412.00302</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.00302">pdf</a>, <a href="https://arxiv.org/format/2412.00302">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> HSLiNets: Hyperspectral Image and LiDAR Data Fusion Using Efficient Dual Non-Linear Feature Learning Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Yang%2C+J+X">Judy X Yang</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+J">Jing Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Sui%2C+C+H">Chen Hong Sui</a>, <a href="/search/eess?searchtype=author&amp;query=Long%2C+Z">Zekun Long</a>, <a href="/search/eess?searchtype=author&amp;query=Zhou%2C+J">Jun Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.00302v2-abstract-short" style="display: inline;"> The integration of hyperspectral imaging (HSI) and LiDAR data within new linear feature spaces offers a promising solution to the challenges posed by the high-dimensionality and redundancy inherent in HSIs. This study introduces a dual linear fused space framework that capitalizes on bidirectional reversed convolutional neural network (CNN) pathways, coupled with a specialized spatial analysis blo&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.00302v2-abstract-full').style.display = 'inline'; document.getElementById('2412.00302v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.00302v2-abstract-full" style="display: none;"> The integration of hyperspectral imaging (HSI) and LiDAR data within new linear feature spaces offers a promising solution to the challenges posed by the high-dimensionality and redundancy inherent in HSIs. This study introduces a dual linear fused space framework that capitalizes on bidirectional reversed convolutional neural network (CNN) pathways, coupled with a specialized spatial analysis block. 
This approach combines the computational efficiency of CNNs with the adaptability of attention mechanisms, facilitating the effective fusion of spectral and spatial information. The proposed method not only enhances data processing and classification accuracy, but also mitigates the computational burden typically associated with advanced models such as Transformers. Evaluations on the Houston 2013 dataset demonstrate that our approach surpasses existing state-of-the-art models. This advancement underscores the potential of the framework in resource-constrained environments and its significant contributions to the field of remote sensing. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.00302v2-abstract-full').style.display = 'none'; document.getElementById('2412.00302v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 29 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages, 2 figures</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> F.2.2; I; 2.7 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.10031">arXiv:2411.10031</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.10031">pdf</a>, <a href="https://arxiv.org/format/2411.10031">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Enforcing Cooperative Safety for Reinforcement Learning-based Mixed-Autonomy Platoon Control </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Zhou%2C+J">Jingyuan Zhou</a>, <a href="/search/eess?searchtype=author&amp;query=Yan%2C+L">Longhao Yan</a>, <a href="/search/eess?searchtype=author&amp;query=Liang%2C+J">Jinhao Liang</a>, <a href="/search/eess?searchtype=author&amp;query=Yang%2C+K">Kaidi Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.10031v1-abstract-short" style="display: inline;"> It is recognized that the control of mixed-autonomy platoons comprising connected and automated vehicles (CAVs) and human-driven vehicles (HDVs) can enhance traffic flow. Among existing methods, Multi-Agent Reinforcement Learning (MARL) appears to be a promising control strategy because it can manage complex scenarios in real time. 
However, current research on MARL-based mixed-autonomy platoon con&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10031v1-abstract-full').style.display = 'inline'; document.getElementById('2411.10031v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.10031v1-abstract-full" style="display: none;"> It is recognized that the control of mixed-autonomy platoons comprising connected and automated vehicles (CAVs) and human-driven vehicles (HDVs) can enhance traffic flow. Among existing methods, Multi-Agent Reinforcement Learning (MARL) appears to be a promising control strategy because it can manage complex scenarios in real time. However, current research on MARL-based mixed-autonomy platoon control suffers from several limitations. First, existing MARL approaches address safety by penalizing safety violations in the reward function, thus lacking theoretical safety guarantees due to the black-box nature of RL. Second, few studies have explored the cooperative safety of multi-CAV platoons, where CAVs can be coordinated to further enhance the system-level safety involving the safety of both CAVs and HDVs. Third, existing work tends to make an unrealistic assumption that the behavior of HDVs and CAVs is publicly known and rational. To bridge the research gaps, we propose a safe MARL framework for mixed-autonomy platoons. Specifically, this framework (i) characterizes cooperative safety by designing a cooperative Control Barrier Function (CBF), enabling CAVs to collaboratively improve the safety of the entire platoon, (ii) provides a safety guarantee to the MARL-based controller by integrating the CBF-based safety constraints into MARL through a differentiable quadratic programming (QP) layer, and (iii) incorporates a conformal prediction module that enables each CAV to estimate the unknown behaviors of the surrounding vehicles with uncertainty quantification. Simulation results show that our proposed control strategy can effectively enhance the system-level safety of a mixed-autonomy platoon through CAV cooperation, with a minimal impact on control performance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10031v1-abstract-full').style.display = 'none'; document.getElementById('2411.10031v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.07500">arXiv:2411.07500</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.07500">pdf</a>, <a href="https://arxiv.org/format/2411.07500">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> MaDiNet: Mamba Diffusion Network for SAR Target Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Zhou%2C+J">Jie Zhou</a>, <a href="/search/eess?searchtype=author&amp;query=Xiao%2C+C">Chao Xiao</a>, <a href="/search/eess?searchtype=author&amp;query=Peng%2C+B">Bowen Peng</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+T">Tianpeng Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+Z">Zhen Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+Y">Yongxiang Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+L">Li Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.07500v1-abstract-short" style="display: inline;"> The fundamental challenge in SAR target detection lies in developing discriminative, efficient, and robust representations of target characteristics within intricate non-cooperative environments. However, accurate target detection is impeded by factors including the sparse distribution and discrete features of the targets, as well as complex background interference. In this study, we propose a \te&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07500v1-abstract-full').style.display = 'inline'; document.getElementById('2411.07500v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.07500v1-abstract-full" style="display: none;"> The fundamental challenge in SAR target detection lies in developing discriminative, efficient, and robust representations of target characteristics within intricate non-cooperative environments. However, accurate target detection is impeded by factors including the sparse distribution and discrete features of the targets, as well as complex background interference. In this study, we propose a \textbf{Ma}mba \textbf{Di}ffusion \textbf{Net}work (MaDiNet) for SAR target detection. Specifically, MaDiNet conceptualizes SAR target detection as the task of generating the position (center coordinates) and size (width and height) of the bounding boxes in the image space. Furthermore, we design a MambaSAR module to capture intricate spatial structural information of targets and enhance the capability of the model to differentiate between targets and complex backgrounds. The experimental results on extensive SAR target detection datasets achieve SOTA, proving the effectiveness of the proposed network. Code is available at \href{https://github.com/JoyeZLearning/MaDiNet}{https://github.com/JoyeZLearning/MaDiNet}. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07500v1-abstract-full').style.display = 'none'; document.getElementById('2411.07500v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.20514">arXiv:2410.20514</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.20514">pdf</a>, <a href="https://arxiv.org/format/2410.20514">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Uncertainty-Aware Decision-Making and Planning for Autonomous Forced Merging </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Zhou%2C+J">Jian Zhou</a>, <a href="/search/eess?searchtype=author&amp;query=Gao%2C+Y">Yulong Gao</a>, <a href="/search/eess?searchtype=author&amp;query=Olofsson%2C+B">Bj枚rn Olofsson</a>, <a href="/search/eess?searchtype=author&amp;query=Frisk%2C+E">Erik Frisk</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.20514v1-abstract-short" style="display: inline;"> In this paper, we develop an uncertainty-aware decision-making and motion-planning method for an autonomous ego vehicle in forced merging scenarios, considering the motion uncertainty of surrounding vehicles. The method dynamically captures the uncertainty of surrounding vehicles by online estimation of their acceleration bounds, enabling a reactive but rapid understanding of the uncertainty chara&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20514v1-abstract-full').style.display = 'inline'; document.getElementById('2410.20514v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.20514v1-abstract-full" style="display: none;"> In this paper, we develop an uncertainty-aware decision-making and motion-planning method for an autonomous ego vehicle in forced merging scenarios, considering the motion uncertainty of surrounding vehicles. The method dynamically captures the uncertainty of surrounding vehicles by online estimation of their acceleration bounds, enabling a reactive but rapid understanding of the uncertainty characteristics of the surrounding vehicles. By leveraging these estimated bounds, a non-conservative forward occupancy of surrounding vehicles is predicted over a horizon, which is incorporated in both the decision-making process and the motion-planning strategy, to enhance the resilience and safety of the planned reference trajectory. The method successfully fulfills the tasks in challenging forced merging scenarios, and the properties are illustrated by comparison with several alternative approaches. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20514v1-abstract-full').style.display = 'none'; document.getElementById('2410.20514v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by the 63rd IEEE Conference on Decision and Control, 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.20236">arXiv:2410.20236</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.20236">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Medical Physics">physics.med-ph</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Photon-Counting CT in Cancer Radiotherapy: Technological Advances and Clinical Benefits </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Shah%2C+K+D">Keyur D. Shah</a>, <a href="/search/eess?searchtype=author&amp;query=Zhou%2C+J">Jun Zhou</a>, <a href="/search/eess?searchtype=author&amp;query=Roper%2C+J">Justin Roper</a>, <a href="/search/eess?searchtype=author&amp;query=Dhabaan%2C+A">Anees Dhabaan</a>, <a href="/search/eess?searchtype=author&amp;query=Al-Hallaq%2C+H">Hania Al-Hallaq</a>, <a href="/search/eess?searchtype=author&amp;query=Pourmorteza%2C+A">Amir Pourmorteza</a>, <a href="/search/eess?searchtype=author&amp;query=Yang%2C+X">Xiaofeng Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.20236v3-abstract-short" style="display: inline;"> Photon-counting computed tomography (PCCT) marks a significant advancement over conventional energy-integrating detector (EID) CT systems. This review highlights PCCT&#39;s superior spatial and contrast resolution, reduced radiation dose, and multi-energy imaging capabilities, which address key challenges in radiotherapy, such as accurate tumor delineation, precise dose calculation, and treatment resp&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20236v3-abstract-full').style.display = 'inline'; document.getElementById('2410.20236v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.20236v3-abstract-full" style="display: none;"> Photon-counting computed tomography (PCCT) marks a significant advancement over conventional energy-integrating detector (EID) CT systems. This review highlights PCCT&#39;s superior spatial and contrast resolution, reduced radiation dose, and multi-energy imaging capabilities, which address key challenges in radiotherapy, such as accurate tumor delineation, precise dose calculation, and treatment response monitoring. PCCT&#39;s improved anatomical clarity enhances tumor targeting while minimizing damage to surrounding healthy tissues. 
Additionally, metal artifact reduction (MAR) and quantitative imaging capabilities optimize workflows, enabling adaptive radiotherapy and radiomics-driven personalized treatment. Emerging clinical applications in brachytherapy and radiopharmaceutical therapy (RPT) show promising outcomes, although challenges like high costs and limited software integration remain. With advancements in artificial intelligence (AI) and dedicated radiotherapy packages, PCCT is poised to transform precision, safety, and efficacy in cancer radiotherapy, marking it as a pivotal technology for future clinical practice. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20236v3-abstract-full').style.display = 'none'; document.getElementById('2410.20236v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 26 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.10570">arXiv:2410.10570</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.10570">pdf</a>, <a href="https://arxiv.org/format/2410.10570">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Mindalogue: LLM-Powered Nonlinear Interaction for Effective Learning and Task Exploration </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+R">Rui Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+Z">Ziyao Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Zhu%2C+F">Fengliang Zhu</a>, <a href="/search/eess?searchtype=author&amp;query=Zhou%2C+J">Jiajie Zhou</a>, <a href="/search/eess?searchtype=author&amp;query=Rao%2C+A">Anyi Rao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.10570v2-abstract-short" style="display: inline;"> Current generative AI models like ChatGPT, Claude, and Gemini are widely used for knowledge dissemination, task decomposition, and creative thinking. However, their linear interaction methods often force users to repeatedly compare and copy contextual information when handling complex tasks, increasing cognitive load and operational costs. Moreover, the ambiguity in model responses requires users&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.10570v2-abstract-full').style.display = 'inline'; document.getElementById('2410.10570v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.10570v2-abstract-full" style="display: none;"> Current generative AI models like ChatGPT, Claude, and Gemini are widely used for knowledge dissemination, task decomposition, and creative thinking. 
However, their linear interaction methods often force users to repeatedly compare and copy contextual information when handling complex tasks, increasing cognitive load and operational costs. Moreover, the ambiguity in model responses requires users to refine and simplify the information further. To address these issues, we developed &#34;Mindalogue&#34;, a system using a non-linear interaction model based on &#34;nodes + canvas&#34; to enhance user efficiency and freedom while generating structured responses. A formative study with 11 users informed the design of Mindalogue, which was then evaluated through a study with 16 participants. The results showed that Mindalogue significantly reduced task steps and improved users&#39; comprehension of complex information. This study highlights the potential of non-linear interaction in improving AI tool efficiency and user experience in the HCI field. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.10570v2-abstract-full').style.display = 'none'; document.getElementById('2410.10570v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 14 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">17 pages, 9 figures</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 68U35(Primary); 68T20(Secondary) <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> H.5.2 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.00703">arXiv:2410.00703</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.00703">pdf</a>, <a href="https://arxiv.org/ps/2410.00703">ps</a>, <a href="https://arxiv.org/format/2410.00703">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Dynamical Systems">math.DS</span> </div> </div> <p class="title is-5 mathjax"> Koopman Spectral Analysis from Noisy Measurements based on Bayesian Learning and Kalman Smoothing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Zeng%2C+Z">Zhexuan Zeng</a>, <a href="/search/eess?searchtype=author&amp;query=Zhou%2C+J">Jun Zhou</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+Y">Yasen Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Ping%2C+Z">Zuowei Ping</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.00703v1-abstract-short" style="display: inline;"> Koopman spectral analysis plays a crucial role in understanding and modeling nonlinear dynamical systems as it reveals key system behaviors and long-term dynamics. However, the presence of measurement noise poses a significant challenge to accurately extracting spectral properties. 
In this work, we propose a robust method for identifying the Koopman operator and extracting its spectral characteris&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.00703v1-abstract-full').style.display = 'inline'; document.getElementById('2410.00703v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.00703v1-abstract-full" style="display: none;"> Koopman spectral analysis plays a crucial role in understanding and modeling nonlinear dynamical systems as it reveals key system behaviors and long-term dynamics. However, the presence of measurement noise poses a significant challenge to accurately extracting spectral properties. In this work, we propose a robust method for identifying the Koopman operator and extracting its spectral characteristics in noisy environments. To address the impact of noise, our approach tackles an identification problem that accounts for both systematic errors from finite-dimensional approximations and measurement noise in the data. By incorporating Bayesian learning and Kalman smoothing, the method simultaneously identifies the Koopman operator and estimates system states, effectively decoupling these two error sources. The method&#39;s efficiency and robustness are demonstrated through extensive experiments, showcasing its accuracy across varying noise levels. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.00703v1-abstract-full').style.display = 'none'; document.getElementById('2410.00703v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.20472">arXiv:2409.20472</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.20472">pdf</a>, <a href="https://arxiv.org/format/2409.20472">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Fluid Antenna-Assisted Near-Field System </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Zhou%2C+J">Jingxuan Zhou</a>, <a href="/search/eess?searchtype=author&amp;query=Yang%2C+Y">Yinchao Yang</a>, <a href="/search/eess?searchtype=author&amp;query=Yang%2C+Z">Zhaohui Yang</a>, <a href="/search/eess?searchtype=author&amp;query=Shikh-Bahaei%2C+M">Mohammad Shikh-Bahaei</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.20472v1-abstract-short" style="display: inline;"> This paper proposes a fluid antenna (FA)-assisted near-field integrated sensing and communications (ISAC) system enabled by the extremely large-scale simultaneously transmitting and reflecting surface (XL-STARS). 
By optimizing the communication beamformer, the sensing signal covariance matrix, the XL-STARS phase shift, and the FA position vector, the Cramér-Rao bound (CRB), as a metric for sensing&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.20472v1-abstract-full').style.display = 'inline'; document.getElementById('2409.20472v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.20472v1-abstract-full" style="display: none;"> This paper proposes a fluid antenna (FA)-assisted near-field integrated sensing and communications (ISAC) system enabled by the extremely large-scale simultaneously transmitting and reflecting surface (XL-STARS). By optimizing the communication beamformer, the sensing signal covariance matrix, the XL-STARS phase shift, and the FA position vector, the Cramér-Rao bound (CRB), as a metric for sensing performance, is minimized while ensuring the standard communication performance. A double-loop iterative algorithm based on the penalty dual decomposition (PDD) and block coordinate descent (BCD) methods is proposed to solve the non-convex minimization problem by decomposing it into three subproblems and optimizing the coupling variables for each subproblem iteratively. Simulation results validate the superior performance of the proposed algorithm. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.20472v1-abstract-full').style.display = 'none'; document.getElementById('2409.20472v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024.
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.18584">arXiv:2409.18584</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.18584">pdf</a>, <a href="https://arxiv.org/format/2409.18584">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> ChildMandarin: A Comprehensive Mandarin Speech Dataset for Young Children Aged 3-5 </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Zhou%2C+J">Jiaming Zhou</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+S">Shiyao Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Zhao%2C+S">Shiwan Zhao</a>, <a href="/search/eess?searchtype=author&amp;query=He%2C+J">Jiabei He</a>, <a href="/search/eess?searchtype=author&amp;query=Sun%2C+H">Haoqin Sun</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+H">Hui Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+C">Cheng Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Kong%2C+A">Aobo Kong</a>, <a href="/search/eess?searchtype=author&amp;query=Guo%2C+Y">Yujie Guo</a>, <a href="/search/eess?searchtype=author&amp;query=Qin%2C+Y">Yong Qin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.18584v2-abstract-short" style="display: inline;"> Automatic speech recognition (ASR) systems have advanced significantly with models like Whisper, Conformer, and self-supervised frameworks such as Wav2vec 2.0 and HuBERT. However, developing robust ASR models for young children&#39;s speech remains challenging due to differences in pronunciation, tone, and pace compared to adult speech. In this paper, we introduce a new Mandarin speech dataset focused&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.18584v2-abstract-full').style.display = 'inline'; document.getElementById('2409.18584v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.18584v2-abstract-full" style="display: none;"> Automatic speech recognition (ASR) systems have advanced significantly with models like Whisper, Conformer, and self-supervised frameworks such as Wav2vec 2.0 and HuBERT. However, developing robust ASR models for young children&#39;s speech remains challenging due to differences in pronunciation, tone, and pace compared to adult speech. In this paper, we introduce a new Mandarin speech dataset focused on children aged 3 to 5, addressing the scarcity of resources in this area. The dataset comprises 41.25 hours of speech with carefully crafted manual transcriptions, collected from 397 speakers across various provinces in China, with balanced gender representation. We provide a comprehensive analysis of speaker demographics, speech duration distribution and geographic coverage. Additionally, we evaluate ASR performance on models trained from scratch, such as Conformer, as well as fine-tuned pre-trained models like HuBERT and Whisper, where fine-tuning demonstrates significant performance improvements. 
Furthermore, we assess speaker verification (SV) on our dataset, showing that, despite the challenges posed by the unique vocal characteristics of young children, the dataset effectively supports both ASR and SV tasks. This dataset is a valuable contribution to Mandarin child speech research and holds potential for applications in educational technology and child-computer interaction. It will be open-source and freely available for all academic purposes. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.18584v2-abstract-full').style.display = 'none'; document.getElementById('2409.18584v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 27 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.16322">arXiv:2409.16322</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.16322">pdf</a>, <a href="https://arxiv.org/format/2409.16322">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Neurons and Cognition">q-bio.NC</span> </div> </div> <p class="title is-5 mathjax"> Towards Within-Class Variation in Alzheimer&#39;s Disease Detection from Spontaneous Speech </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Kang%2C+J">Jiawen Kang</a>, <a href="/search/eess?searchtype=author&amp;query=Han%2C+D">Dongrui Han</a>, <a href="/search/eess?searchtype=author&amp;query=Meng%2C+L">Lingwei Meng</a>, <a href="/search/eess?searchtype=author&amp;query=Zhou%2C+J">Jingyan Zhou</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+J">Jinchao Li</a>, <a href="/search/eess?searchtype=author&amp;query=Wu%2C+X">Xixin Wu</a>, <a href="/search/eess?searchtype=author&amp;query=Meng%2C+H">Helen Meng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.16322v1-abstract-short" style="display: inline;"> Alzheimer&#39;s Disease (AD) detection has emerged as a promising research area that employs machine learning classification models to distinguish between individuals with AD and those without. Unlike conventional classification tasks, we identify within-class variation as a critical challenge in AD detection: individuals with AD exhibit a spectrum of cognitive impairments. 
Given that many AD detectio&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.16322v1-abstract-full').style.display = 'inline'; document.getElementById('2409.16322v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.16322v1-abstract-full" style="display: none;"> Alzheimer&#39;s Disease (AD) detection has emerged as a promising research area that employs machine learning classification models to distinguish between individuals with AD and those without. Unlike conventional classification tasks, we identify within-class variation as a critical challenge in AD detection: individuals with AD exhibit a spectrum of cognitive impairments. Given that many AD detection tasks lack fine-grained labels, simplistic binary classification may overlook two crucial aspects: within-class differences and instance-level imbalance. The former compels the model to map AD samples with varying degrees of impairment to a single diagnostic label, disregarding certain changes in cognitive function, while the latter biases the model towards overrepresented severity levels. This work presents early efforts to address these challenges. We propose two novel methods: Soft Target Distillation (SoTD) and Instance-level Re-balancing (InRe), targeting the two problems, respectively. Experiments on the ADReSS and ADReSSo datasets demonstrate that the proposed methods significantly improve detection accuracy. Further analysis reveals that SoTD effectively harnesses the strengths of multiple component models, while InRe substantially alleviates model over-fitting. These findings provide insights for developing more robust and reliable AD detection models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.16322v1-abstract-full').style.display = 'none'; document.getElementById('2409.16322v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024.
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.13832">arXiv:2409.13832</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.13832">pdf</a>, <a href="https://arxiv.org/format/2409.13832">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> GTSinger: A Global Multi-Technique Singing Corpus with Realistic Music Scores for All Singing Tasks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+Y">Yu Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Pan%2C+C">Changhao Pan</a>, <a href="/search/eess?searchtype=author&amp;query=Guo%2C+W">Wenxiang Guo</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+R">Ruiqi Li</a>, <a href="/search/eess?searchtype=author&amp;query=Zhu%2C+Z">Zhiyuan Zhu</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+J">Jialei Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Xu%2C+W">Wenhao Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Lu%2C+J">Jingyu Lu</a>, <a href="/search/eess?searchtype=author&amp;query=Hong%2C+Z">Zhiqing Hong</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+C">Chuxin Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+L">LiChao Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=He%2C+J">Jinzheng He</a>, <a href="/search/eess?searchtype=author&amp;query=Jiang%2C+Z">Ziyue Jiang</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+Y">Yuxin Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Yang%2C+C">Chen Yang</a>, <a href="/search/eess?searchtype=author&amp;query=Zhou%2C+J">Jiecheng Zhou</a>, <a href="/search/eess?searchtype=author&amp;query=Cheng%2C+X">Xinyu Cheng</a>, <a href="/search/eess?searchtype=author&amp;query=Zhao%2C+Z">Zhou Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.13832v5-abstract-short" style="display: inline;"> The scarcity of high-quality and multi-task singing datasets significantly hinders the development of diverse controllable and personalized singing tasks, as existing singing datasets suffer from low quality, limited diversity of languages and singers, absence of multi-technique information and realistic music scores, and poor task suitability. 
To tackle these problems, we present GTSinger, a larg&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.13832v5-abstract-full').style.display = 'inline'; document.getElementById('2409.13832v5-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.13832v5-abstract-full" style="display: none;"> The scarcity of high-quality and multi-task singing datasets significantly hinders the development of diverse controllable and personalized singing tasks, as existing singing datasets suffer from low quality, limited diversity of languages and singers, absence of multi-technique information and realistic music scores, and poor task suitability. To tackle these problems, we present GTSinger, a large global, multi-technique, free-to-use, high-quality singing corpus with realistic music scores, designed for all singing tasks, along with its benchmarks. Particularly, (1) we collect 80.59 hours of high-quality singing voices, forming the largest recorded singing dataset; (2) 20 professional singers across nine widely spoken languages offer diverse timbres and styles; (3) we provide controlled comparison and phoneme-level annotations of six commonly used singing techniques, helping technique modeling and control; (4) GTSinger offers realistic music scores, assisting real-world musical composition; (5) singing voices are accompanied by manual phoneme-to-audio alignments, global style labels, and 16.16 hours of paired speech for various singing tasks. Moreover, to facilitate the use of GTSinger, we conduct four benchmark experiments: technique-controllable singing voice synthesis, technique recognition, style transfer, and speech-to-singing conversion. The corpus and demos can be found at http://aaronz345.github.io/GTSingerDemo/. We provide the dataset and the code for processing data and conducting benchmarks at https://huggingface.co/datasets/GTSinger/GTSinger and https://github.com/AaronZ345/GTSinger. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.13832v5-abstract-full').style.display = 'none'; document.getElementById('2409.13832v5-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 20 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by NeurIPS 2024 (Spotlight)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.12121">arXiv:2409.12121</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.12121">pdf</a>, <a href="https://arxiv.org/format/2409.12121">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> WMCodec: End-to-End Neural Speech Codec with Deep Watermarking for Authenticity Verification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Zhou%2C+J">Junzuo Zhou</a>, <a href="/search/eess?searchtype=author&amp;query=Yi%2C+J">Jiangyan Yi</a>, <a href="/search/eess?searchtype=author&amp;query=Ren%2C+Y">Yong Ren</a>, <a href="/search/eess?searchtype=author&amp;query=Tao%2C+J">Jianhua Tao</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+T">Tao Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+C+Y">Chu Yuan Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.12121v3-abstract-short" style="display: inline;"> Recent advances in speech spoofing necessitate stronger verification mechanisms in neural speech codecs to ensure authenticity. Current methods embed numerical watermarks before compression and extract them from reconstructed speech for verification, but face limitations such as separate training processes for the watermark and codec, and insufficient cross-modal information integration, leading t&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.12121v3-abstract-full').style.display = 'inline'; document.getElementById('2409.12121v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.12121v3-abstract-full" style="display: none;"> Recent advances in speech spoofing necessitate stronger verification mechanisms in neural speech codecs to ensure authenticity. Current methods embed numerical watermarks before compression and extract them from reconstructed speech for verification, but face limitations such as separate training processes for the watermark and codec, and insufficient cross-modal information integration, leading to reduced watermark imperceptibility, extraction accuracy, and capacity. To address these issues, we propose WMCodec, the first neural speech codec to jointly train compression-reconstruction and watermark embedding-extraction in an end-to-end manner, optimizing both imperceptibility and extractability of the watermark. Furthermore, we design an iterative Attention Imprint Unit (AIU) for deeper feature integration of watermark and speech, reducing the impact of quantization noise on the watermark. Experimental results show WMCodec outperforms AudioSeal with Encodec in most quality metrics for watermark imperceptibility and consistently exceeds both AudioSeal with Encodec and reinforced TraceableSpeech in watermark extraction accuracy.
At bandwidth of 6 kbps with a watermark capacity of 16 bps, WMCodec maintains over 99% extraction accuracy under common attacks, demonstrating strong robustness. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.12121v3-abstract-full').style.display = 'none'; document.getElementById('2409.12121v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.11889">arXiv:2409.11889</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.11889">pdf</a>, <a href="https://arxiv.org/format/2409.11889">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> M2R-Whisper: Multi-stage and Multi-scale Retrieval Augmentation for Enhancing Whisper </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Zhou%2C+J">Jiaming Zhou</a>, <a href="/search/eess?searchtype=author&amp;query=Zhao%2C+S">Shiwan Zhao</a>, <a href="/search/eess?searchtype=author&amp;query=He%2C+J">Jiabei He</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+H">Hui Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Zeng%2C+W">Wenjia Zeng</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+Y">Yong Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Sun%2C+H">Haoqin Sun</a>, <a href="/search/eess?searchtype=author&amp;query=Kong%2C+A">Aobo Kong</a>, <a href="/search/eess?searchtype=author&amp;query=Qin%2C+Y">Yong Qin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.11889v2-abstract-short" style="display: inline;"> State-of-the-art models like OpenAI&#39;s Whisper exhibit strong performance in multilingual automatic speech recognition (ASR), but they still face challenges in accurately recognizing diverse subdialects. In this paper, we propose M2R-whisper, a novel multi-stage and multi-scale retrieval augmentation approach designed to enhance ASR performance in low-resource settings. Building on the principles o&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.11889v2-abstract-full').style.display = 'inline'; document.getElementById('2409.11889v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.11889v2-abstract-full" style="display: none;"> State-of-the-art models like OpenAI&#39;s Whisper exhibit strong performance in multilingual automatic speech recognition (ASR), but they still face challenges in accurately recognizing diverse subdialects. In this paper, we propose M2R-whisper, a novel multi-stage and multi-scale retrieval augmentation approach designed to enhance ASR performance in low-resource settings. 
Building on the principles of in-context learning (ICL) and retrieval-augmented techniques, our method employs sentence-level ICL in the pre-processing stage to harness contextual information, while integrating token-level k-Nearest Neighbors (kNN) retrieval as a post-processing step to further refine the final output distribution. By synergistically combining sentence-level and token-level retrieval strategies, M2R-whisper effectively mitigates various types of recognition errors. Experiments conducted on Mandarin and subdialect datasets, including AISHELL-1 and KeSpeech, demonstrate substantial improvements in ASR accuracy, all achieved without any parameter updates. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.11889v2-abstract-full').style.display = 'none'; document.getElementById('2409.11889v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.10978">arXiv:2409.10978</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.10978">pdf</a>, <a href="https://arxiv.org/format/2409.10978">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Edge-based Denoising Image Compression </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Morita%2C+R">Ryugo Morita</a>, <a href="/search/eess?searchtype=author&amp;query=Nishimura%2C+H">Hitoshi Nishimura</a>, <a href="/search/eess?searchtype=author&amp;query=Watanabe%2C+K">Ko Watanabe</a>, <a href="/search/eess?searchtype=author&amp;query=Dengel%2C+A">Andreas Dengel</a>, <a href="/search/eess?searchtype=author&amp;query=Zhou%2C+J">Jinjia Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.10978v1-abstract-short" style="display: inline;"> In recent years, deep learning-based image compression, particularly through generative models, has emerged as a pivotal area of research. Despite significant advancements, challenges such as diminished sharpness and quality in reconstructed images, learning inefficiencies due to mode collapse, and data loss during transmission persist. To address these issues, we propose a novel compression model&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.10978v1-abstract-full').style.display = 'inline'; document.getElementById('2409.10978v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.10978v1-abstract-full" style="display: none;"> In recent years, deep learning-based image compression, particularly through generative models, has emerged as a pivotal area of research. 
Despite significant advancements, challenges such as diminished sharpness and quality in reconstructed images, learning inefficiencies due to mode collapse, and data loss during transmission persist. To address these issues, we propose a novel compression model that incorporates a denoising step with diffusion models, significantly enhancing image reconstruction fidelity by leveraging sub-information (e.g., edge and depth) from the latent space. Empirical experiments demonstrate that our model achieves superior or comparable results in terms of image quality and compression efficiency when measured against the existing models. Notably, our model excels in scenarios of partial image loss or excessive noise by introducing an edge estimation network to preserve the integrity of reconstructed images, offering a robust solution to the current limitations of image compression. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.10978v1-abstract-full').style.display = 'none'; document.getElementById('2409.10978v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.08665">arXiv:2409.08665</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.08665">pdf</a>, <a href="https://arxiv.org/format/2409.08665">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Agile Decision-Making and Safety-Critical Motion Planning for Emergency Autonomous Vehicles </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Shu%2C+Y">Yiming Shu</a>, <a href="/search/eess?searchtype=author&amp;query=Zhou%2C+J">Jingyuan Zhou</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+F">Fu Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.08665v3-abstract-short" style="display: inline;"> Efficiency is critical for autonomous vehicles (AVs), especially for emergency AVs. However, most existing methods focus on regular vehicles, overlooking the distinct strategies required by emergency vehicles to address the challenge of maximizing efficiency while ensuring safety. In this paper, we propose an Integrated Agile Decision-Making with Active and Safety-Critical Motion Planning System (&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.08665v3-abstract-full').style.display = 'inline'; document.getElementById('2409.08665v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.08665v3-abstract-full" style="display: none;"> Efficiency is critical for autonomous vehicles (AVs), especially for emergency AVs.
However, most existing methods focus on regular vehicles, overlooking the distinct strategies required by emergency vehicles to address the challenge of maximizing efficiency while ensuring safety. In this paper, we propose an Integrated Agile Decision-Making with Active and Safety-Critical Motion Planning System (IDEAM). IDEAM focuses on enabling emergency AVs, such as ambulances, to actively attain efficiency in dense traffic scenarios with safety in mind. Firstly, the speed-centric decision-making algorithm named the long short-term spatio-temporal graph-centric decision-making (LSGM) is given. LSGM comprises conditional depth-first search (C-DFS) for multiple paths generation as well as methods for speed gains and risk evaluation for path selection, which presents a robust algorithm for high efficiency and safety consideration. Secondly, with an output path from LSGM, the motion planner reconsiders environmental conditions to decide constraints states for the final planning stage, among which the lane-probing state is designed for actively attaining spatial and speed advantage. Thirdly, under the Frenet-based model predictive control (MPC) framework with final constraints state and selected path, the safety-critical motion planner employs decoupled discrete control barrier functions (DCBFs) and linearized discrete-time high-order control barrier functions (DHOCBFs) to model the constraints associated with different driving behaviors, making the optimal optimization problem convex. Finally, we extensively validate our system using scenarios from a randomly synthetic dataset, demonstrating its capability to achieve speed benefits and assure safety simultaneously. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.08665v3-abstract-full').style.display = 'none'; document.getElementById('2409.08665v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 13 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.08191">arXiv:2409.08191</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.08191">pdf</a>, <a href="https://arxiv.org/ps/2409.08191">ps</a>, <a href="https://arxiv.org/format/2409.08191">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Optimal Operation of Distribution System Operator and the Impact of Peer-to-Peer Transactions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Lin%2C+H">Hanyang Lin</a>, <a href="/search/eess?searchtype=author&amp;query=Guo%2C+Y">Ye Guo</a>, <a href="/search/eess?searchtype=author&amp;query=Nazir%2C+F+U">Firdous Ul Nazir</a>, <a href="/search/eess?searchtype=author&amp;query=Zhou%2C+J">Jianguo Zhou</a>, <a href="/search/eess?searchtype=author&amp;query=Chung%2C+C+Y">Chi Yung Chung</a>, <a href="/search/eess?searchtype=author&amp;query=Hatziargyriou%2C+N">Nikos Hatziargyriou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.08191v1-abstract-short" style="display: inline;"> Peer-to-peer (P2P) energy trading, commonly recognized as a decentralized approach, has emerged as a popular way to better utilize distributed energy resources (DERs). In order to better manage this user-side decentralized approach from a system operator&#39;s point of view, this paper proposes an optimal operation approach for distribution system operators (DSO), comprising internal prosumers who eng&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.08191v1-abstract-full').style.display = 'inline'; document.getElementById('2409.08191v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.08191v1-abstract-full" style="display: none;"> Peer-to-peer (P2P) energy trading, commonly recognized as a decentralized approach, has emerged as a popular way to better utilize distributed energy resources (DERs). In order to better manage this user-side decentralized approach from a system operator&#39;s point of view, this paper proposes an optimal operation approach for distribution system operators (DSO), comprising internal prosumers who engage in P2P transactions. The DSO is assumed to be a financially neutral entity, holding the responsibility of aggregating the surplus energy and deficit demand of prosumers after their P2P transactions while dispatching DERs and considering network integrity. Impacts of P2P transactions on DSO&#39;s optimal operation have been studied. Results indicate that energy matching P2P trading, where only the total amount of energy over a given period of time is defined, may affect quantities of energy exchanged between the DSO and the wholesale market, but not internal dispatch decisions of the DSO. Different levels of real-time power consistency may lead to different total surpluses in the distribution network. For the real-time power matching P2P trading, as a special case of energy matching P2P trading, the provided energy and total surplus are not affected.
In other words, DSO can safely ignore P2P transactions if they follow the format defined in this paper. Case studies verify these conclusions and further demonstrate that P2P trading will not affect physical power flow of the whole system, but the financial distribution between the DSO and prosumers. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.08191v1-abstract-full').style.display = 'none'; document.getElementById('2409.08191v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.05430">arXiv:2409.05430</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.05430">pdf</a>, <a href="https://arxiv.org/format/2409.05430">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Findings of the 2024 Mandarin Stuttering Event Detection and Automatic Speech Recognition Challenge </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Xue%2C+H">Hongfei Xue</a>, <a href="/search/eess?searchtype=author&amp;query=Gong%2C+R">Rong Gong</a>, <a href="/search/eess?searchtype=author&amp;query=Shao%2C+M">Mingchen Shao</a>, <a href="/search/eess?searchtype=author&amp;query=Xu%2C+X">Xin Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+L">Lezhi Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Xie%2C+L">Lei Xie</a>, <a href="/search/eess?searchtype=author&amp;query=Bu%2C+H">Hui Bu</a>, <a href="/search/eess?searchtype=author&amp;query=Zhou%2C+J">Jiaming Zhou</a>, <a href="/search/eess?searchtype=author&amp;query=Qin%2C+Y">Yong Qin</a>, <a href="/search/eess?searchtype=author&amp;query=Du%2C+J">Jun Du</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+M">Ming Li</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+B">Binbin Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Jia%2C+B">Bin Jia</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.05430v1-abstract-short" style="display: inline;"> The StutteringSpeech Challenge focuses on advancing speech technologies for people who stutter, specifically targeting Stuttering Event Detection (SED) and Automatic Speech Recognition (ASR) in Mandarin. 
The challenge comprises three tracks: (1) SED, which aims to develop systems for detection of stuttering events; (2) ASR, which focuses on creating robust systems for recognizing stuttered speech;&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.05430v1-abstract-full').style.display = 'inline'; document.getElementById('2409.05430v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.05430v1-abstract-full" style="display: none;"> The StutteringSpeech Challenge focuses on advancing speech technologies for people who stutter, specifically targeting Stuttering Event Detection (SED) and Automatic Speech Recognition (ASR) in Mandarin. The challenge comprises three tracks: (1) SED, which aims to develop systems for detection of stuttering events; (2) ASR, which focuses on creating robust systems for recognizing stuttered speech; and (3) a Research track for innovative approaches utilizing the provided dataset. We utilize the open-source Mandarin stuttering dataset AS-70, which has been split into new training and test sets for the challenge. This paper presents the dataset, details the challenge tracks, and analyzes the performance of the top systems, highlighting improvements in detection accuracy and reductions in recognition error rates. Our findings underscore the potential of specialized models and augmentation strategies in developing stuttered speech technologies. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.05430v1-abstract-full').style.display = 'none'; document.getElementById('2409.05430v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages, 2 figures, accepted by SLT 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.04799">arXiv:2409.04799</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.04799">pdf</a>, <a href="https://arxiv.org/format/2409.04799">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> PB-LRDWWS System for the SLT 2024 Low-Resource Dysarthria Wake-Up Word Spotting Challenge </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Wang%2C+S">Shiyao Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Zhou%2C+J">Jiaming Zhou</a>, <a href="/search/eess?searchtype=author&amp;query=Zhao%2C+S">Shiwan Zhao</a>, <a href="/search/eess?searchtype=author&amp;query=Qin%2C+Y">Yong Qin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.04799v2-abstract-short" style="display: inline;"> For the SLT 2024 Low-Resource Dysarthria Wake-Up Word Spotting (LRDWWS) Challenge, we introduce the PB-LRDWWS system. This system combines a dysarthric speech content feature extractor for prototype construction with a prototype-based classification method. The feature extractor is a fine-tuned HuBERT model obtained through a three-stage fine-tuning process using cross-entropy loss. This fine-tune&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.04799v2-abstract-full').style.display = 'inline'; document.getElementById('2409.04799v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.04799v2-abstract-full" style="display: none;"> For the SLT 2024 Low-Resource Dysarthria Wake-Up Word Spotting (LRDWWS) Challenge, we introduce the PB-LRDWWS system. This system combines a dysarthric speech content feature extractor for prototype construction with a prototype-based classification method. The feature extractor is a fine-tuned HuBERT model obtained through a three-stage fine-tuning process using cross-entropy loss. This fine-tuned HuBERT extracts features from the target dysarthric speaker&#39;s enrollment speech to build prototypes. Classification is achieved by calculating the cosine similarity between the HuBERT features of the target dysarthric speaker&#39;s evaluation speech and prototypes. Despite its simplicity, our method demonstrates effectiveness through experimental results. Our system achieves second place in the final Test-B of the LRDWWS Challenge. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.04799v2-abstract-full').style.display = 'none'; document.getElementById('2409.04799v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 7 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">accept by SLT 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.03179">arXiv:2409.03179</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.03179">pdf</a>, <a href="https://arxiv.org/format/2409.03179">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3664647.3681512">10.1145/3664647.3681512 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Perceptual-Distortion Balanced Image Super-Resolution is a Multi-Objective Optimization Problem </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Zhu%2C+Q">Qiwen Zhu</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+Y">Yanjie Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Cai%2C+S">Shilv Cai</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+L">Liqun Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Zhou%2C+J">Jiahuan Zhou</a>, <a href="/search/eess?searchtype=author&amp;query=Yan%2C+L">Luxin Yan</a>, <a href="/search/eess?searchtype=author&amp;query=Zhong%2C+S">Sheng Zhong</a>, <a href="/search/eess?searchtype=author&amp;query=Zou%2C+X">Xu Zou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.03179v1-abstract-short" style="display: inline;"> Training Single-Image Super-Resolution (SISR) models using pixel-based regression losses can achieve high distortion metrics scores (e.g., PSNR and SSIM), but often results in blurry images due to insufficient recovery of high-frequency details. 
Conversely, using GAN or perceptual losses can produce sharp images with high perceptual metric scores (e.g., LPIPS), but may introduce artifacts and inco&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.03179v1-abstract-full').style.display = 'inline'; document.getElementById('2409.03179v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.03179v1-abstract-full" style="display: none;"> Training Single-Image Super-Resolution (SISR) models using pixel-based regression losses can achieve high distortion metrics scores (e.g., PSNR and SSIM), but often results in blurry images due to insufficient recovery of high-frequency details. Conversely, using GAN or perceptual losses can produce sharp images with high perceptual metric scores (e.g., LPIPS), but may introduce artifacts and incorrect textures. Balancing these two types of losses can help achieve a trade-off between distortion and perception, but the challenge lies in tuning the loss function weights. To address this issue, we propose a novel method that incorporates Multi-Objective Optimization (MOO) into the training process of SISR models to balance perceptual quality and distortion. We conceptualize the relationship between loss weights and image quality assessment (IQA) metrics as black-box objective functions to be optimized within our Multi-Objective Bayesian Optimization Super-Resolution (MOBOSR) framework. This approach automates the hyperparameter tuning process, reduces overall computational cost, and enables the use of numerous loss functions simultaneously. Extensive experiments demonstrate that MOBOSR outperforms state-of-the-art methods in terms of both perceptual quality and distortion, significantly advancing the perception-distortion Pareto frontier. Our work points towards a new direction for future research on balancing perceptual quality and fidelity in nearly all image restoration tasks. The source code and pretrained models are available at: https://github.com/ZhuKeven/MOBOSR. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.03179v1-abstract-full').style.display = 'none'; document.getElementById('2409.03179v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.00121">arXiv:2409.00121</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.00121">pdf</a>, <a href="https://arxiv.org/format/2409.00121">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> BELT-2: Bootstrapping EEG-to-Language representation alignment for multi-task brain decoding </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Zhou%2C+J">Jinzhao Zhou</a>, <a href="/search/eess?searchtype=author&amp;query=Duan%2C+Y">Yiqun Duan</a>, <a href="/search/eess?searchtype=author&amp;query=Chang%2C+F">Fred Chang</a>, <a href="/search/eess?searchtype=author&amp;query=Do%2C+T">Thomas Do</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+Y">Yu-Kai Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Lin%2C+C">Chin-Teng Lin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.00121v1-abstract-short" style="display: inline;"> The remarkable success of large language models (LLMs) across various multi-modality applications is well established. However, integrating large language models with humans, or brain dynamics, remains relatively unexplored. In this paper, we introduce BELT-2, a pioneering multi-task model designed to enhance both encoding and decoding performance from EEG signals. To bolster the quality of the EE&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.00121v1-abstract-full').style.display = 'inline'; document.getElementById('2409.00121v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.00121v1-abstract-full" style="display: none;"> The remarkable success of large language models (LLMs) across various multi-modality applications is well established. However, integrating large language models with humans, or brain dynamics, remains relatively unexplored. In this paper, we introduce BELT-2, a pioneering multi-task model designed to enhance both encoding and decoding performance from EEG signals. To bolster the quality of the EEG encoder, BELT-2 is the first work to innovatively 1) adopt byte-pair encoding (BPE)-level EEG-language alignment and 2) integrate multi-task training and decoding in the EEG domain. Inspired by the idea of &#34;Bridging the Brain with GPT&#34;, we further connect the multi-task EEG encoder with LLMs by utilizing prefix-tuning on intermediary output from the EEG encoder. These innovative efforts make BELT-2 a pioneering breakthrough, making it the first work in the field capable of decoding coherent and readable sentences from non-invasive brain signals.
Our experiments highlight significant advancements over prior techniques in both quantitative and qualitative measures, achieving a decoding performance with a BLEU-1 score of 52.2% on the ZuCo dataset. Furthermore, BELT-2 shows a remarkable improvement ranging from 31% to 162% on other translation benchmarks. Codes can be accessed via the provided anonymous link (https://anonymous.4open.science/r/BELT-2-0048). <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.00121v1-abstract-full').style.display = 'none'; document.getElementById('2409.00121v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.12829">arXiv:2408.12829</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.12829">pdf</a>, <a href="https://arxiv.org/format/2408.12829">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Uncertainty-Aware Mean Opinion Score Prediction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Wang%2C+H">Hui Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Zhao%2C+S">Shiwan Zhao</a>, <a href="/search/eess?searchtype=author&amp;query=Zhou%2C+J">Jiaming Zhou</a>, <a href="/search/eess?searchtype=author&amp;query=Zheng%2C+X">Xiguang Zheng</a>, <a href="/search/eess?searchtype=author&amp;query=Sun%2C+H">Haoqin Sun</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+X">Xuechen Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Qin%2C+Y">Yong Qin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.12829v1-abstract-short" style="display: inline;"> Mean Opinion Score (MOS) prediction has made significant progress in specific domains. However, the unstable performance of MOS prediction models across diverse samples presents ongoing challenges in the practical application of these systems. In this paper, we point out that the absence of uncertainty modeling is a significant limitation hindering MOS prediction systems from applying to the real&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.12829v1-abstract-full').style.display = 'inline'; document.getElementById('2408.12829v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.12829v1-abstract-full" style="display: none;"> Mean Opinion Score (MOS) prediction has made significant progress in specific domains. However, the unstable performance of MOS prediction models across diverse samples presents ongoing challenges in the practical application of these systems.
In this paper, we point out that the absence of uncertainty modeling is a significant limitation hindering MOS prediction systems from applying to the real and open world. We analyze the sources of uncertainty in the MOS prediction task and propose to establish an uncertainty-aware MOS prediction system that models aleatory uncertainty and epistemic uncertainty by heteroscedastic regression and Monte Carlo dropout separately. The experimental results show that the system captures uncertainty well and is capable of performing selective prediction and out-of-domain detection. Such capabilities significantly enhance the practical utility of MOS systems in diverse real and open-world environments. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.12829v1-abstract-full').style.display = 'none'; document.getElementById('2408.12829v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by Interspeech 2024, oral</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.07444">arXiv:2408.07444</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.07444">pdf</a>, <a href="https://arxiv.org/format/2408.07444">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Costal Cartilage Segmentation with Topology Guided Deformable Mamba: Method and Benchmark </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Wang%2C+S">Senmao Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Gong%2C+H">Haifan Gong</a>, <a href="/search/eess?searchtype=author&amp;query=Cui%2C+R">Runmeng Cui</a>, <a href="/search/eess?searchtype=author&amp;query=Wan%2C+B">Boyao Wan</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+Y">Yicheng Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Hu%2C+Z">Zhonglin Hu</a>, <a href="/search/eess?searchtype=author&amp;query=Yang%2C+H">Haiqing Yang</a>, <a href="/search/eess?searchtype=author&amp;query=Zhou%2C+J">Jingyang Zhou</a>, <a href="/search/eess?searchtype=author&amp;query=Pan%2C+B">Bo Pan</a>, <a href="/search/eess?searchtype=author&amp;query=Lin%2C+L">Lin Lin</a>, <a href="/search/eess?searchtype=author&amp;query=Jiang%2C+H">Haiyue Jiang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.07444v1-abstract-short" style="display: inline;"> Costal cartilage segmentation is crucial to various medical applications, necessitating precise and reliable techniques due to its complex anatomy and the importance of accurate diagnosis and surgical planning. We propose a novel deep learning-based approach called topology-guided deformable Mamba (TGDM) for costal cartilage segmentation. 
The TGDM is tailored to capture the intricate long-range co&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.07444v1-abstract-full').style.display = 'inline'; document.getElementById('2408.07444v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.07444v1-abstract-full" style="display: none;"> Costal cartilage segmentation is crucial to various medical applications, necessitating precise and reliable techniques due to its complex anatomy and the importance of accurate diagnosis and surgical planning. We propose a novel deep learning-based approach called topology-guided deformable Mamba (TGDM) for costal cartilage segmentation. The TGDM is tailored to capture the intricate long-range costal cartilage relationships. Our method leverages a deformable model that integrates topological priors to enhance the adaptability and accuracy of the segmentation process. Furthermore, we developed a comprehensive benchmark that contains 165 cases for costal cartilage segmentation. This benchmark sets a new standard for evaluating costal cartilage segmentation techniques and provides a valuable resource for future research. Extensive experiments conducted on both in-domain benchmarks and out-of-domain test sets demonstrate the superiority of our approach over existing methods, showing significant improvements in segmentation precision and robustness. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.07444v1-abstract-full').style.display = 'none'; document.getElementById('2408.07444v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.04967">arXiv:2408.04967</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.04967">pdf</a>, <a href="https://arxiv.org/format/2408.04967">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> ADD 2023: Towards Audio Deepfake Detection and Analysis in the Wild </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Yi%2C+J">Jiangyan Yi</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+C+Y">Chu Yuan Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Tao%2C+J">Jianhua Tao</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+C">Chenglong Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Yan%2C+X">Xinrui Yan</a>, <a href="/search/eess?searchtype=author&amp;query=Ren%2C+Y">Yong Ren</a>, <a href="/search/eess?searchtype=author&amp;query=Gu%2C+H">Hao Gu</a>, <a href="/search/eess?searchtype=author&amp;query=Zhou%2C+J">Junzuo Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.04967v3-abstract-short" style="display: inline;"> The growing prominence of the field of audio deepfake detection is driven by its wide range of applications, notably in protecting the public from potential fraud and other malicious activities, prompting the need for greater attention and research in this area. The ADD 2023 challenge goes beyond binary real/fake classification by emulating real-world scenarios, such as the identification of manip&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.04967v3-abstract-full').style.display = 'inline'; document.getElementById('2408.04967v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.04967v3-abstract-full" style="display: none;"> The growing prominence of the field of audio deepfake detection is driven by its wide range of applications, notably in protecting the public from potential fraud and other malicious activities, prompting the need for greater attention and research in this area. The ADD 2023 challenge goes beyond binary real/fake classification by emulating real-world scenarios, such as the identification of manipulated intervals in partially fake audio and determining the source responsible for generating any fake audio, both with real-life implications, notably in audio forensics, law enforcement, and construction of reliable and trustworthy evidence. To further foster research in this area, in this article, we describe the dataset that was used in the fake game, manipulation region location and deepfake algorithm recognition tracks of the challenge. We also focus on the analysis of the technical methodologies by the top-performing participants in each task and note the commonalities and differences in their approaches. Finally, we discuss the current technical limitations as identified through the technical analysis, and provide a roadmap for future research directions. 
The dataset is available for download at http://addchallenge.cn/downloadADD2023. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.04967v3-abstract-full').style.display = 'none'; document.getElementById('2408.04967v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 9 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This work has been submitted to the IEEE for possible publication</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.01738">arXiv:2408.01738</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.01738">pdf</a>, <a href="https://arxiv.org/format/2408.01738">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Adaptive Safety with Control Barrier Functions and Triggered Batch Least-Squares Identifier </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Shen%2C+J">Jiajun Shen</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+W">Wei Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Zhou%2C+J">Jing Zhou</a>, <a href="/search/eess?searchtype=author&amp;query=L%C3%BC%2C+J">Jinhu Lü</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.01738v2-abstract-short" style="display: inline;"> In this paper, a triggered Batch Least-Squares Identifier (BaLSI) based adaptive safety control scheme is proposed for uncertain systems with potentially conflicting control objectives and safety constraints. A relaxation term is added to the Quadratic Programs (QP) combining the transformed Control Lyapunov Functions (CLFs) and Control Barrier Functions (CBFs), to mediate the potential conflict.&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.01738v2-abstract-full').style.display = 'inline'; document.getElementById('2408.01738v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.01738v2-abstract-full" style="display: none;"> In this paper, a triggered Batch Least-Squares Identifier (BaLSI) based adaptive safety control scheme is proposed for uncertain systems with potentially conflicting control objectives and safety constraints. A relaxation term is added to the Quadratic Programs (QP) combining the transformed Control Lyapunov Functions (CLFs) and Control Barrier Functions (CBFs), to mediate the potential conflict. The existing Lyapunov-based adaptive schemes designed to guarantee specific properties of the Lyapunov functions, may grow unboundedly under the effects of the relaxation term. The adaptive law is designed by processing system inputs and outputs, to avoid unbounded estimates and overparameterization problems in the existing results. 
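To make the quadratic program just described concrete: a standard relaxed CLF-CBF QP keeps the safety (CBF) constraint hard and softens the stability (CLF) constraint with a nonnegative slack that is penalized in the objective. The cvxpy sketch below uses generic textbook constraint forms and made-up gains; it is not the paper's transformed CLF/CBF design or its BaLSI adaptation.

```python
import cvxpy as cp
import numpy as np

def relaxed_clf_cbf_qp(u_nom, LfV, LgV, V, Lfh, Lgh, h,
                       gamma=1.0, alpha=1.0, penalty=100.0):
    """Solve  min ||u - u_nom||^2 + penalty*delta^2
       s.t.   LfV + LgV@u <= -gamma*V + delta   (CLF, relaxed by delta >= 0)
              Lfh + Lgh@u >= -alpha*h           (CBF, kept hard for safety)."""
    m = u_nom.shape[0]
    u = cp.Variable(m)
    delta = cp.Variable(nonneg=True)
    objective = cp.Minimize(cp.sum_squares(u - u_nom) + penalty * cp.square(delta))
    constraints = [
        LfV + LgV @ u <= -gamma * V + delta,
        Lfh + Lgh @ u >= -alpha * h,
    ]
    cp.Problem(objective, constraints).solve()
    return u.value, delta.value

# Toy single-input example with made-up Lie derivatives.
u, d = relaxed_clf_cbf_qp(u_nom=np.array([0.5]),
                          LfV=0.0, LgV=np.array([2.0]), V=1.0,
                          Lfh=0.0, Lgh=np.array([1.0]), h=0.5)
```

The penalty weight trades tracking of the nominal control against how far the CLF condition may be relaxed when it conflicts with the safety constraint.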
A safety-triggered condition is presented, based on which the forward invariant property of the safe set is shown and Zeno behavior can be excluded. Simulation results are presented to demonstrate the effectiveness of the proposed adaptive control scheme. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.01738v2-abstract-full').style.display = 'none'; document.getElementById('2408.01738v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 3 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">11 pages, 10 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.00325">arXiv:2408.00325</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.00325">pdf</a>, <a href="https://arxiv.org/format/2408.00325">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Iterative Prototype Refinement for Ambiguous Speech Emotion Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Sun%2C+H">Haoqin Sun</a>, <a href="/search/eess?searchtype=author&amp;query=Zhao%2C+S">Shiwan Zhao</a>, <a href="/search/eess?searchtype=author&amp;query=Kong%2C+X">Xiangyu Kong</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+X">Xuechen Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+H">Hui Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Zhou%2C+J">Jiaming Zhou</a>, <a href="/search/eess?searchtype=author&amp;query=Qin%2C+Y">Yong Qin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.00325v1-abstract-short" style="display: inline;"> Recognizing emotions from speech is a daunting task due to the subtlety and ambiguity of expressions. Traditional speech emotion recognition (SER) systems, which typically rely on a singular, precise emotion label, struggle with this complexity. Therefore, modeling the inherent ambiguity of emotions is an urgent problem. In this paper, we propose an iterative prototype refinement framework (IPR) f&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.00325v1-abstract-full').style.display = 'inline'; document.getElementById('2408.00325v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.00325v1-abstract-full" style="display: none;"> Recognizing emotions from speech is a daunting task due to the subtlety and ambiguity of expressions. Traditional speech emotion recognition (SER) systems, which typically rely on a singular, precise emotion label, struggle with this complexity. 
Therefore, modeling the inherent ambiguity of emotions is an urgent problem. In this paper, we propose an iterative prototype refinement framework (IPR) for ambiguous SER. IPR comprises two interlinked components: contrastive learning and class prototypes. The former provides an efficient way to obtain high-quality representations of ambiguous samples. The latter are dynamically updated based on ambiguous labels -- the similarity of the ambiguous data to all prototypes. These refined embeddings yield precise pseudo labels, thus reinforcing representation quality. Experimental evaluations conducted on the IEMOCAP dataset validate the superior performance of IPR over state-of-the-art methods, thus proving the effectiveness of our proposed method. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.00325v1-abstract-full').style.display = 'none'; document.getElementById('2408.00325v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.20878">arXiv:2407.20878</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.20878">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> S3PET: Semi-supervised Standard-dose PET Image Reconstruction via Dose-aware Token Swap </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Cui%2C+J">Jiaqi Cui</a>, <a href="/search/eess?searchtype=author&amp;query=Zeng%2C+P">Pinxian Zeng</a>, <a href="/search/eess?searchtype=author&amp;query=Xu%2C+Y">Yuanyuan Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Wu%2C+X">Xi Wu</a>, <a href="/search/eess?searchtype=author&amp;query=Zhou%2C+J">Jiliu Zhou</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+Y">Yan Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.20878v1-abstract-short" style="display: inline;"> To acquire high-quality positron emission tomography (PET) images while reducing the radiation tracer dose, numerous efforts have been devoted to reconstructing standard-dose PET (SPET) images from low-dose PET (LPET). However, the success of current fully-supervised approaches relies on abundant paired LPET and SPET images, which are often unavailable in clinic. 
Moreover, these methods often mix&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.20878v1-abstract-full').style.display = 'inline'; document.getElementById('2407.20878v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.20878v1-abstract-full" style="display: none;"> To acquire high-quality positron emission tomography (PET) images while reducing the radiation tracer dose, numerous efforts have been devoted to reconstructing standard-dose PET (SPET) images from low-dose PET (LPET). However, the success of current fully-supervised approaches relies on abundant paired LPET and SPET images, which are often unavailable in clinic. Moreover, these methods often mix the dose-invariant content with dose level-related dose-specific details during reconstruction, resulting in distorted images. To alleviate these problems, in this paper, we propose a two-stage Semi-Supervised SPET reconstruction framework, namely S3PET, to accommodate the training of abundant unpaired and limited paired SPET and LPET images. Our S3PET involves an un-supervised pre-training stage (Stage I) to extract representations from unpaired images, and a supervised dose-aware reconstruction stage (Stage II) to achieve LPET-to-SPET reconstruction by transferring the dose-specific knowledge between paired images. Specifically, in stage I, two independent dose-specific masked autoencoders (DsMAEs) are adopted to comprehensively understand the unpaired SPET and LPET images. Then, in Stage II, the pre-trained DsMAEs are further finetuned using paired images. To prevent distortions in both content and details, we introduce two elaborate modules, i.e., a dose knowledge decouple module to disentangle the respective dose-specific and dose-invariant knowledge of LPET and SPET, and a dose-specific knowledge learning module to transfer the dose-specific information from SPET to LPET, thereby achieving high-quality SPET reconstruction from LPET images. Experiments on two datasets demonstrate that our S3PET achieves state-of-the-art performance quantitatively and qualitatively. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.20878v1-abstract-full').style.display = 'none'; document.getElementById('2407.20878v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. 
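Stage I above rests on masked-autoencoder pre-training: a random subset of patch tokens is hidden and the network is trained to reconstruct it. A generic sketch of that masking step is given below; shapes and ratios are assumptions, and this is not the S3PET code.

```python
import torch

def random_patch_mask(tokens, mask_ratio=0.75):
    """tokens: (batch, num_patches, dim). Returns the visible subset fed to the
    encoder plus the boolean mask the decoder must reconstruct, MAE-style."""
    b, n, d = tokens.shape
    n_keep = int(n * (1 - mask_ratio))
    noise = torch.rand(b, n)                      # random score per patch
    keep_idx = noise.argsort(dim=1)[:, :n_keep]   # lowest-noise patches survive
    visible = torch.gather(tokens, 1, keep_idx.unsqueeze(-1).expand(-1, -1, d))
    mask = torch.ones(b, n, dtype=torch.bool)
    mask.scatter_(1, keep_idx, False)             # True = masked / to reconstruct
    return visible, mask

# e.g. a PET volume already split into 216 patch embeddings of width 128
vis, msk = random_patch_mask(torch.randn(2, 216, 128))
```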
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.18461">arXiv:2407.18461</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.18461">pdf</a>, <a href="https://arxiv.org/format/2407.18461">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.21437/Interspeech.2024-1360">10.21437/Interspeech.2024-1360 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Enhancing Dysarthric Speech Recognition for Unseen Speakers via Prototype-Based Adaptation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Wang%2C+S">Shiyao Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Zhao%2C+S">Shiwan Zhao</a>, <a href="/search/eess?searchtype=author&amp;query=Zhou%2C+J">Jiaming Zhou</a>, <a href="/search/eess?searchtype=author&amp;query=Kong%2C+A">Aobo Kong</a>, <a href="/search/eess?searchtype=author&amp;query=Qin%2C+Y">Yong Qin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.18461v1-abstract-short" style="display: inline;"> Dysarthric speech recognition (DSR) presents a formidable challenge due to inherent inter-speaker variability, leading to severe performance degradation when applying DSR models to new dysarthric speakers. Traditional speaker adaptation methodologies typically involve fine-tuning models for each speaker, but this strategy is cost-prohibitive and inconvenient for disabled users, requiring substanti&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.18461v1-abstract-full').style.display = 'inline'; document.getElementById('2407.18461v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.18461v1-abstract-full" style="display: none;"> Dysarthric speech recognition (DSR) presents a formidable challenge due to inherent inter-speaker variability, leading to severe performance degradation when applying DSR models to new dysarthric speakers. Traditional speaker adaptation methodologies typically involve fine-tuning models for each speaker, but this strategy is cost-prohibitive and inconvenient for disabled users, requiring substantial data collection. To address this issue, we introduce a prototype-based approach that markedly improves DSR performance for unseen dysarthric speakers without additional fine-tuning. Our method employs a feature extractor trained with HuBERT to produce per-word prototypes that encapsulate the characteristics of previously unseen speakers. These prototypes serve as the basis for classification. Additionally, we incorporate supervised contrastive learning to refine feature extraction. 
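The per-word prototypes above can be pictured as mean embeddings of each word's support examples, with an unseen speaker's utterance assigned to the nearest prototype. The sketch below uses cosine similarity, which is an assumption on our part; it is illustrative only, not the authors' released code.

```python
import torch
import torch.nn.functional as F

def build_prototypes(embeddings, labels, num_classes):
    """embeddings: (N, D) features; labels: (N,) word ids.
    Each prototype is the mean embedding of one word class."""
    protos = torch.stack([embeddings[labels == c].mean(dim=0)
                          for c in range(num_classes)])
    return F.normalize(protos, dim=-1)

def classify(query, prototypes):
    """Assign each query feature to the word whose prototype is most similar."""
    sims = F.normalize(query, dim=-1) @ prototypes.T   # cosine similarity
    return sims.argmax(dim=-1)

# Toy usage: 100 support features over a 10-word vocabulary, 5 test utterances.
feats, labels = torch.randn(100, 768), torch.randint(0, 10, (100,))
protos = build_prototypes(feats, labels, num_classes=10)
pred = classify(torch.randn(5, 768), protos)
```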
By enhancing representation quality, we further improve DSR performance, enabling effective personalized DSR. We release our code at https://github.com/NKU-HLT/PB-DSR. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.18461v1-abstract-full').style.display = 'none'; document.getElementById('2407.18461v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">accepted by Interspeech 2024</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> INTERSPEECH 2024 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.10759">arXiv:2407.10759</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.10759">pdf</a>, <a href="https://arxiv.org/format/2407.10759">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Qwen2-Audio Technical Report </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Chu%2C+Y">Yunfei Chu</a>, <a href="/search/eess?searchtype=author&amp;query=Xu%2C+J">Jin Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Yang%2C+Q">Qian Yang</a>, <a href="/search/eess?searchtype=author&amp;query=Wei%2C+H">Haojie Wei</a>, <a href="/search/eess?searchtype=author&amp;query=Wei%2C+X">Xipin Wei</a>, <a href="/search/eess?searchtype=author&amp;query=Guo%2C+Z">Zhifang Guo</a>, <a href="/search/eess?searchtype=author&amp;query=Leng%2C+Y">Yichong Leng</a>, <a href="/search/eess?searchtype=author&amp;query=Lv%2C+Y">Yuanjun Lv</a>, <a href="/search/eess?searchtype=author&amp;query=He%2C+J">Jinzheng He</a>, <a href="/search/eess?searchtype=author&amp;query=Lin%2C+J">Junyang Lin</a>, <a href="/search/eess?searchtype=author&amp;query=Zhou%2C+C">Chang Zhou</a>, <a href="/search/eess?searchtype=author&amp;query=Zhou%2C+J">Jingren Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.10759v1-abstract-short" style="display: inline;"> We introduce the latest progress of Qwen-Audio, a large-scale audio-language model called Qwen2-Audio, which is capable of accepting various audio signal inputs and performing audio analysis or direct textual responses with regard to speech instructions. 
In contrast to complex hierarchical tags, we have simplified the pre-training process by utilizing natural language prompts for different data an&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.10759v1-abstract-full').style.display = 'inline'; document.getElementById('2407.10759v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.10759v1-abstract-full" style="display: none;"> We introduce the latest progress of Qwen-Audio, a large-scale audio-language model called Qwen2-Audio, which is capable of accepting various audio signal inputs and performing audio analysis or direct textual responses with regard to speech instructions. In contrast to complex hierarchical tags, we have simplified the pre-training process by utilizing natural language prompts for different data and tasks, and have further expanded the data volume. We have boosted the instruction-following capability of Qwen2-Audio and implemented two distinct audio interaction modes for voice chat and audio analysis. In the voice chat mode, users can freely engage in voice interactions with Qwen2-Audio without text input. In the audio analysis mode, users could provide audio and text instructions for analysis during the interaction. Note that we do not use any system prompts to switch between voice chat and audio analysis modes. Qwen2-Audio is capable of intelligently comprehending the content within audio and following voice commands to respond appropriately. For instance, in an audio segment that simultaneously contains sounds, multi-speaker conversations, and a voice command, Qwen2-Audio can directly understand the command and provide an interpretation and response to the audio. Additionally, DPO has optimized the model&#39;s performance in terms of factuality and adherence to desired behavior. According to the evaluation results from AIR-Bench, Qwen2-Audio outperformed previous SOTAs, such as Gemini-1.5-pro, in tests focused on audio-centric instruction-following capabilities. Qwen2-Audio is open-sourced with the aim of fostering the advancement of the multi-modal language community. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.10759v1-abstract-full').style.display = 'none'; document.getElementById('2407.10759v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">https://github.com/QwenLM/Qwen2-Audio. 
Checkpoints, codes and scripts will be open-sourced soon</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.09029">arXiv:2407.09029</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.09029">pdf</a>, <a href="https://arxiv.org/format/2407.09029">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Enhancing Emotion Recognition in Incomplete Data: A Novel Cross-Modal Alignment, Reconstruction, and Refinement Framework </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Sun%2C+H">Haoqin Sun</a>, <a href="/search/eess?searchtype=author&amp;query=Zhao%2C+S">Shiwan Zhao</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+S">Shaokai Li</a>, <a href="/search/eess?searchtype=author&amp;query=Kong%2C+X">Xiangyu Kong</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+X">Xuechen Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Kong%2C+A">Aobo Kong</a>, <a href="/search/eess?searchtype=author&amp;query=Zhou%2C+J">Jiaming Zhou</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+Y">Yong Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Zeng%2C+W">Wenjia Zeng</a>, <a href="/search/eess?searchtype=author&amp;query=Qin%2C+Y">Yong Qin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.09029v1-abstract-short" style="display: inline;"> Multimodal emotion recognition systems rely heavily on the full availability of modalities, suffering significant performance declines when modal data is incomplete. To tackle this issue, we present the Cross-Modal Alignment, Reconstruction, and Refinement (CM-ARR) framework, an innovative approach that sequentially engages in cross-modal alignment, reconstruction, and refinement phases to handle&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.09029v1-abstract-full').style.display = 'inline'; document.getElementById('2407.09029v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.09029v1-abstract-full" style="display: none;"> Multimodal emotion recognition systems rely heavily on the full availability of modalities, suffering significant performance declines when modal data is incomplete. To tackle this issue, we present the Cross-Modal Alignment, Reconstruction, and Refinement (CM-ARR) framework, an innovative approach that sequentially engages in cross-modal alignment, reconstruction, and refinement phases to handle missing modalities and enhance emotion recognition. This framework utilizes unsupervised distribution-based contrastive learning to align heterogeneous modal distributions, reducing discrepancies and modeling semantic uncertainty effectively. 
The reconstruction phase applies normalizing flow models to transform these aligned distributions and recover missing modalities. The refinement phase employs supervised point-based contrastive learning to disrupt semantic correlations and accentuate emotional traits, thereby enriching the affective content of the reconstructed representations. Extensive experiments on the IEMOCAP and MSP-IMPROV datasets confirm the superior performance of CM-ARR under conditions of both missing and complete modalities. Notably, averaged across six scenarios of missing modalities, CM-ARR achieves absolute improvements of 2.11% in WAR and 2.12% in UAR on the IEMOCAP dataset, and 1.71% and 1.96% in WAR and UAR, respectively, on the MSP-IMPROV dataset. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.09029v1-abstract-full').style.display = 'none'; document.getElementById('2407.09029v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.07720">arXiv:2407.07720</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.07720">pdf</a>, <a href="https://arxiv.org/format/2407.07720">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Exploiting Scale-Variant Attention for Segmenting Small Medical Objects </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Dai%2C+W">Wei Dai</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+R">Rui Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Wu%2C+Z">Zixuan Wu</a>, <a href="/search/eess?searchtype=author&amp;query=Wu%2C+T">Tianyi Wu</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+M">Min Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Zhou%2C+J">Junxian Zhou</a>, <a href="/search/eess?searchtype=author&amp;query=Yuan%2C+Y">Yixuan Yuan</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+J">Jun Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.07720v4-abstract-short" style="display: inline;"> Early detection and accurate diagnosis can predict the risk of malignant disease transformation, thereby increasing the probability of effective treatment. Identifying mild syndrome with small pathological regions serves as an ominous warning and is fundamental in the early diagnosis of diseases. 
While deep learning algorithms, particularly convolutional neural networks (CNNs), have shown promise&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.07720v4-abstract-full').style.display = 'inline'; document.getElementById('2407.07720v4-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.07720v4-abstract-full" style="display: none;"> Early detection and accurate diagnosis can predict the risk of malignant disease transformation, thereby increasing the probability of effective treatment. Identifying mild syndrome with small pathological regions serves as an ominous warning and is fundamental in the early diagnosis of diseases. While deep learning algorithms, particularly convolutional neural networks (CNNs), have shown promise in segmenting medical objects, analyzing small areas in medical images remains challenging. This difficulty arises due to information losses and compression defects from convolution and pooling operations in CNNs, which become more pronounced as the network deepens, especially for small medical objects. To address these challenges, we propose a novel scale-variant attention-based network (SvANet) for accurately segmenting small-scale objects in medical images. The SvANet consists of scale-variant attention, cross-scale guidance, Monte Carlo attention, and vision transformer, which incorporates cross-scale features and alleviates compression artifacts for enhancing the discrimination of small medical objects. Quantitative experimental results demonstrate the superior performance of SvANet, achieving 96.12%, 96.11%, 89.79%, 84.15%, 80.25%, 73.05%, and 72.58% in mean Dice coefficient for segmenting kidney tumors, skin lesions, hepatic tumors, polyps, surgical excision cells, retinal vasculatures, and sperms, which occupy less than 1% of the image areas in KiTS23, ISIC 2018, ATLAS, PolypGen, TissueNet, FIVES, and SpermHealth datasets, respectively. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.07720v4-abstract-full').style.display = 'none'; document.getElementById('2407.07720v4-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 10 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. 
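The mean Dice coefficients reported above are the usual overlap score for segmentation masks; for reference, a minimal implementation on binary masks is shown below (generic metric code, not the SvANet evaluation pipeline).

```python
import numpy as np

def dice_coefficient(pred, target, eps=1e-7):
    """Dice = 2|A∩B| / (|A| + |B|) on binary masks; 1.0 means perfect overlap."""
    pred = pred.astype(bool)
    target = target.astype(bool)
    intersection = np.logical_and(pred, target).sum()
    return (2.0 * intersection + eps) / (pred.sum() + target.sum() + eps)

# Tiny example: two 4x4 masks sharing 3 of 4 foreground pixels -> Dice ≈ 0.86.
a = np.zeros((4, 4), int); a[0, :4] = 1
b = np.zeros((4, 4), int); b[0, :3] = 1
print(dice_coefficient(a, b))
```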
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">14 pages, 9 figures, under review</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.07419">arXiv:2407.07419</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.07419">pdf</a>, <a href="https://arxiv.org/format/2407.07419">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/LPT.2024.3457870">10.1109/LPT.2024.3457870 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Timing Recovery for Non-Orthogonal Multiple Access with Asynchronous Clocks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Lu%2C+Q">Qingxin Lu</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+H">Haide Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Mo%2C+W">Wenxuan Mo</a>, <a href="/search/eess?searchtype=author&amp;query=Zhou%2C+J">Ji Zhou</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+W">Weiping Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Yu%2C+C">Changyuan Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.07419v2-abstract-short" style="display: inline;"> A passive optical network (PON) based on non-orthogonal multiple access (NOMA) meets low latency and high capacity. In the NOMA-PON, the asynchronous clocks between the strong and weak optical network units (ONUs) cause the timing error and phase noise on the signal of the weak ONU. The theoretical derivation shows that the timing error and phase noise can be independently compensated. In this Let&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.07419v2-abstract-full').style.display = 'inline'; document.getElementById('2407.07419v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.07419v2-abstract-full" style="display: none;"> A passive optical network (PON) based on non-orthogonal multiple access (NOMA) meets low latency and high capacity. In the NOMA-PON, the asynchronous clocks between the strong and weak optical network units (ONUs) cause the timing error and phase noise on the signal of the weak ONU. The theoretical derivation shows that the timing error and phase noise can be independently compensated. In this Letter, we propose a timing recovery (TR) algorithm based on an absolute timing error detector (Abs TED) and a pilot-based carrier phase recovery (CPR) to eliminate the timing error and phase noise separately. An experiment for 25G NOMA-PON is set up to verify the feasibility of the proposed algorithms. The weak ONU can achieve the 20% soft-decision forward error correction limit after compensating for timing error and phase noise. In conclusion, the proposed TR and the pilot-based CPR show great potential for the NOMA-PON. 
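The pilot-based carrier phase recovery mentioned above can be illustrated in its simplest textbook form: correlate the received pilots against the known pilot symbols, take the angle of the sum as the phase estimate, and derotate the block. The numpy sketch below assumes a single constant phase per block, a simplification; it is not the Letter's Abs-TED/CPR algorithm.

```python
import numpy as np

def pilot_cpr(rx_block, pilot_pos, pilot_syms):
    """Estimate one carrier phase per block from pilots and derotate the payload.
    rx_block: complex received symbols; pilot_pos: pilot indices; pilot_syms: known pilots."""
    # Maximum-likelihood-style estimate: angle of sum of rx * conj(reference).
    phase = np.angle(np.sum(rx_block[pilot_pos] * np.conj(pilot_syms)))
    return rx_block * np.exp(-1j * phase), phase

# Toy QPSK block with a 0.3 rad phase offset; pilots every 8th symbol.
rng = np.random.default_rng(0)
syms = (rng.choice([1, -1], 64) + 1j * rng.choice([1, -1], 64)) / np.sqrt(2)
rx = syms * np.exp(1j * 0.3)
pilot_pos = np.arange(0, 64, 8)
corrected, est = pilot_cpr(rx, pilot_pos, syms[pilot_pos])
print(round(est, 3))   # ≈ 0.3
```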
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.07419v2-abstract-full').style.display = 'none'; document.getElementById('2407.07419v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 10 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">The Letter has been submitted to the IEEE Photonics Technology Letters</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.19677">arXiv:2406.19677</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.19677">pdf</a>, <a href="https://arxiv.org/format/2406.19677">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> End-to-End Uplink Performance Analysis of Satellite-Based IoT Networks: A Stochastic Geometry Approach </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Zhou%2C+J">Jiusi Zhou</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+R">Ruibo Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Shihada%2C+B">Basem Shihada</a>, <a href="/search/eess?searchtype=author&amp;query=Alouini%2C+M">Mohamed-Slim Alouini</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.19677v1-abstract-short" style="display: inline;"> With the deployment of satellite constellations, Internet-of-Things (IoT) devices in remote areas have gained access to low-cost network connectivity. In this paper, we investigate the performance of IoT devices connecting in up-link through low Earth orbit (LEO) satellites to geosynchronous equatorial orbit (GEO) links. We model the dynamic LEO satellite constellation using the stochastic geometr&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.19677v1-abstract-full').style.display = 'inline'; document.getElementById('2406.19677v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.19677v1-abstract-full" style="display: none;"> With the deployment of satellite constellations, Internet-of-Things (IoT) devices in remote areas have gained access to low-cost network connectivity. In this paper, we investigate the performance of IoT devices connecting in up-link through low Earth orbit (LEO) satellites to geosynchronous equatorial orbit (GEO) links. We model the dynamic LEO satellite constellation using the stochastic geometry method and provide an analysis of end-to-end availability with low-complexity and coverage performance estimates for the mentioned link. 
Based on the analytical expressions derived in this research, we make a sound investigation on the impact of constellation configuration, transmission power, and the relative positions of IoT devices and GEO satellites on end-to-end performance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.19677v1-abstract-full').style.display = 'none'; document.getElementById('2406.19677v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.14534">arXiv:2406.14534</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.14534">pdf</a>, <a href="https://arxiv.org/format/2406.14534">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Epicardium Prompt-guided Real-time Cardiac Ultrasound Frame-to-volume Registration </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Lei%2C+L">Long Lei</a>, <a href="/search/eess?searchtype=author&amp;query=Zhou%2C+J">Jun Zhou</a>, <a href="/search/eess?searchtype=author&amp;query=Pei%2C+J">Jialun Pei</a>, <a href="/search/eess?searchtype=author&amp;query=Zhao%2C+B">Baoliang Zhao</a>, <a href="/search/eess?searchtype=author&amp;query=Jin%2C+Y">Yueming Jin</a>, <a href="/search/eess?searchtype=author&amp;query=Teoh%2C+Y+J">Yuen-Chun Jeremy Teoh</a>, <a href="/search/eess?searchtype=author&amp;query=Qin%2C+J">Jing Qin</a>, <a href="/search/eess?searchtype=author&amp;query=Heng%2C+P">Pheng-Ann Heng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.14534v3-abstract-short" style="display: inline;"> A comprehensive guidance view for cardiac interventional surgery can be provided by the real-time fusion of the intraoperative 2D images and preoperative 3D volume based on the ultrasound frame-to-volume registration. However, cardiac ultrasound images are characterized by a low signal-to-noise ratio and small differences between adjacent frames, coupled with significant dimension variations betwe&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.14534v3-abstract-full').style.display = 'inline'; document.getElementById('2406.14534v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.14534v3-abstract-full" style="display: none;"> A comprehensive guidance view for cardiac interventional surgery can be provided by the real-time fusion of the intraoperative 2D images and preoperative 3D volume based on the ultrasound frame-to-volume registration. 
However, cardiac ultrasound images are characterized by a low signal-to-noise ratio and small differences between adjacent frames, coupled with significant dimension variations between 2D frames and 3D volumes to be registered, resulting in real-time and accurate cardiac ultrasound frame-to-volume registration being a very challenging task. This paper introduces a lightweight end-to-end Cardiac Ultrasound frame-to-volume Registration network, termed CU-Reg. Specifically, the proposed model leverages epicardium prompt-guided anatomical clues to reinforce the interaction of 2D sparse and 3D dense features, followed by a voxel-wise local-global aggregation of enhanced features, thereby boosting the cross-dimensional matching effectiveness of low-quality ultrasound modalities. We further embed an inter-frame discriminative regularization term within the hybrid supervised learning to increase the distinction between adjacent slices in the same ultrasound volume to ensure registration stability. Experimental results on the reprocessed CAMUS dataset demonstrate that our CU-Reg surpasses existing methods in terms of registration accuracy and efficiency, meeting the guidance requirements of clinical cardiac interventional surgery. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.14534v3-abstract-full').style.display = 'none'; document.getElementById('2406.14534v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 20 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This paper has been accepted by MICCAI 2024</span> </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Zhou%2C+J&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Zhou%2C+J&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Zhou%2C+J&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Zhou%2C+J&amp;start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Zhou%2C+J&amp;start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Zhou%2C+J&amp;start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Zhou%2C+J&amp;start=250" class="pagination-link " aria-label="Page 6" aria-current="page">6 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Zhou%2C+J&amp;start=300" class="pagination-link " aria-label="Page 7" aria-current="page">7 </a> </li> </ul> </nav> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div 
class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>
