
Search | arXiv e-print repository

<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1&ndash;50 of 261 results for author: <span class="mathjax">Zhang, M</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> <div class="content"> <form method="GET" action="/search/eess" aria-role="search"> Searching in archive <strong>eess</strong>. <a href="/search/?searchtype=author&amp;query=Zhang%2C+M">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Zhang, M"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Zhang%2C+M&amp;terms-0-field=author&amp;size=50&amp;order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Zhang, M"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Zhang%2C+M&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Zhang%2C+M&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Zhang%2C+M&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Zhang%2C+M&amp;start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Zhang%2C+M&amp;start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Zhang%2C+M&amp;start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Zhang%2C+M&amp;start=250" class="pagination-link " aria-label="Page 6" aria-current="page">6 </a> </li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.18199">arXiv:2411.18199</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.18199">pdf</a>, <a href="https://arxiv.org/format/2411.18199">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Semantic Edge Computing and Semantic Communications in 6G Networks: A Unifying Survey and Research Challenges </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+M">Milin Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Abdi%2C+M">Mohammad Abdi</a>, <a href="/search/eess?searchtype=author&amp;query=Dasari%2C+V+R">Venkat R. Dasari</a>, <a href="/search/eess?searchtype=author&amp;query=Restuccia%2C+F">Francesco Restuccia</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.18199v1-abstract-short" style="display: inline;"> Semantic Edge Computing (SEC) and Semantic Communications (SemComs) have been proposed as viable approaches to achieve real-time edge-enabled intelligence in sixth-generation (6G) wireless networks. 
On one hand, SemCom leverages the strength of Deep Neural Networks (DNNs) to encode and communicate the semantic information only, while making it robust to channel distortions by compensating for wire&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.18199v1-abstract-full').style.display = 'inline'; document.getElementById('2411.18199v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.18199v1-abstract-full" style="display: none;"> Semantic Edge Computing (SEC) and Semantic Communications (SemComs) have been proposed as viable approaches to achieve real-time edge-enabled intelligence in sixth-generation (6G) wireless networks. On one hand, SemCom leverages the strength of Deep Neural Networks (DNNs) to encode and communicate the semantic information only, while making it robust to channel distortions by compensating for wireless effects. Ultimately, this leads to an improvement in the communication efficiency. On the other hand, SEC has leveraged distributed DNNs to divide the computation of a DNN across different devices based on their computational and networking constraints. Although significant progress has been made in both fields, the literature lacks a systematic view to connect both fields. In this work, we fulfill the current gap by unifying the SEC and SemCom fields. We summarize the research problems in these two fields and provide a comprehensive review of the state of the art with a focus on their technical strengths and challenges. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.18199v1-abstract-full').style.display = 'none'; document.getElementById('2411.18199v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Submitted to ACM Computing Surveys (CSUR)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.13159">arXiv:2411.13159</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.13159">pdf</a>, <a href="https://arxiv.org/format/2411.13159">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Hard-Synth: Synthesizing Diverse Hard Samples for ASR using Zero-Shot TTS and LLM </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Yu%2C+J">Jiawei Yu</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+Y">Yuang Li</a>, <a href="/search/eess?searchtype=author&amp;query=Qiao%2C+X">Xiaosong Qiao</a>, <a href="/search/eess?searchtype=author&amp;query=Zhao%2C+H">Huan Zhao</a>, <a href="/search/eess?searchtype=author&amp;query=Zhao%2C+X">Xiaofeng Zhao</a>, <a href="/search/eess?searchtype=author&amp;query=Tang%2C+W">Wei Tang</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+M">Min Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Yang%2C+H">Hao Yang</a>, <a href="/search/eess?searchtype=author&amp;query=Su%2C+J">Jinsong Su</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.13159v1-abstract-short" style="display: inline;"> Text-to-speech (TTS) models have been widely adopted to enhance automatic speech recognition (ASR) systems using text-only corpora, thereby reducing the cost of labeling real speech data. Existing research primarily utilizes additional text data and predefined speech styles supported by TTS models. In this paper, we propose Hard-Synth, a novel ASR data augmentation method that leverages large lang&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13159v1-abstract-full').style.display = 'inline'; document.getElementById('2411.13159v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.13159v1-abstract-full" style="display: none;"> Text-to-speech (TTS) models have been widely adopted to enhance automatic speech recognition (ASR) systems using text-only corpora, thereby reducing the cost of labeling real speech data. Existing research primarily utilizes additional text data and predefined speech styles supported by TTS models. In this paper, we propose Hard-Synth, a novel ASR data augmentation method that leverages large language models (LLMs) and advanced zero-shot TTS. Our approach employs LLMs to generate diverse in-domain text through rewriting, without relying on additional text data. Rather than using predefined speech styles, we introduce a hard prompt selection method with zero-shot TTS to clone speech styles that the ASR model finds challenging to recognize. 
Experiments demonstrate that Hard-Synth significantly enhances the Conformer model, achieving relative word error rate (WER) reductions of 6.5\%/4.4\% on LibriSpeech dev/test-other subsets. Additionally, we show that Hard-Synth is data-efficient and capable of reducing bias in ASR. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13159v1-abstract-full').style.display = 'none'; document.getElementById('2411.13159v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.09783">arXiv:2411.09783</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.09783">pdf</a>, <a href="https://arxiv.org/format/2411.09783">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Exploring the Use of Autonomous Unmanned Vehicles for Supporting Power Grid Operations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Zhou%2C+Y">Yuqi Zhou</a>, <a href="/search/eess?searchtype=author&amp;query=Feng%2C+C">Cong Feng</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+M">Mingzhi Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Yang%2C+R">Rui Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.09783v1-abstract-short" style="display: inline;"> This paper explores the use of autonomous unmanned vehicles for supporting power grid operations. With built-in batteries and the capability to carry additional battery energy storage, the rising number of autonomous vehicles can represent a substantial amount of capacity that is currently underutilized in the power grid. Unlike traditional electric vehicles which require drivers, the operations o&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09783v1-abstract-full').style.display = 'inline'; document.getElementById('2411.09783v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.09783v1-abstract-full" style="display: none;"> This paper explores the use of autonomous unmanned vehicles for supporting power grid operations. With built-in batteries and the capability to carry additional battery energy storage, the rising number of autonomous vehicles can represent a substantial amount of capacity that is currently underutilized in the power grid. Unlike traditional electric vehicles which require drivers, the operations of autonomous vehicles can be performed without human intervention. To guide idle vehicles to support power grids autonomously, we propose a tractable optimization-based method for effectively integrating these ``mobile batteries&#39;&#39; into grid operations. During real-time operations, the vehicles are strategically routed to target locations to help maintain system power balance and reduce operating costs. 
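
   Note: the WER figures quoted above are relative reductions against a baseline system, not absolute percentage-point drops. A minimal illustration of the arithmetic (the baseline value below is hypothetical, not taken from the paper):

   ```python
   def relative_wer_reduction(baseline_wer: float, new_wer: float) -> float:
       """Relative WER reduction, expressed as a percentage of the baseline WER."""
       return 100.0 * (baseline_wer - new_wer) / baseline_wer

   # Hypothetical example: a baseline WER of 8.00% improved to 7.48% is a ~6.5%
   # relative reduction (comparable in form to the 6.5%/4.4% figures reported
   # above), even though the absolute gain is only about 0.5 percentage points.
   print(round(relative_wer_reduction(8.00, 7.48), 1))  # 6.5
   ```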

3. arXiv:2411.09783 [pdf, other]
   Subjects: eess.SY (Systems and Control)
   Title: Exploring the Use of Autonomous Unmanned Vehicles for Supporting Power Grid Operations
   Authors: Yuqi Zhou, Cong Feng, Mingzhi Zhang, Rui Yang
   Abstract: This paper explores the use of autonomous unmanned vehicles for supporting power grid operations. With built-in batteries and the capability to carry additional battery energy storage, the rising number of autonomous vehicles can represent a substantial amount of capacity that is currently underutilized in the power grid. Unlike traditional electric vehicles, which require drivers, the operations of autonomous vehicles can be performed without human intervention. To guide idle vehicles to support power grids autonomously, we propose a tractable optimization-based method for effectively integrating these "mobile batteries" into grid operations. During real-time operations, the vehicles are strategically routed to target locations to help maintain system power balance and reduce operating costs. Numerical studies have confirmed both the validity and scalability of the proposed algorithm for efficiently integrating autonomous vehicles into routine power system operations.
   Submitted 14 November, 2024; originally announced November 2024.

4. arXiv:2411.08538 [pdf]
   Subjects: physics.app-ph (Applied Physics); eess.SY (Systems and Control)
   Title: Intelligent Adaptive Metasurface in Complex Wireless Environments
   Authors: Han Qing Yang, Jun Yan Dai, Hui Dong Li, Lijie Wu, Meng Zhen Zhang, Zi Hang Shen, Si Ran Wang, Zheng Xing Wang, Wankai Tang, Shi Jin, Jun Wei Wu, Qiang Cheng, Tie Jun Cui
   Abstract: The programmable metasurface is regarded as one of the most promising transformative technologies for next-generation wireless system applications. Due to its lack of effective perception of the external electromagnetic environment, the programmable metasurface faces numerous challenges in the intelligent regulation of wireless channels and still relies on external sensors to reshape the electromagnetic environment as desired. To address this problem, we propose an adaptive metasurface (AMS) which integrates the capabilities of acquiring wireless environment information and manipulating reflected electromagnetic (EM) waves in a programmable manner. The proposed design endows the metasurfaces with excellent capabilities to sense the complex electromagnetic field distributions around them and then dynamically manipulate the waves and signals in real time under the guidance of the sensed information, eliminating the need for prior knowledge or external inputs about the wireless environment. For verification, a prototype of the proposed AMS is constructed, and its dual capabilities of sensing and manipulation are experimentally validated. Additionally, different integrated sensing and communication (ISAC) scenarios with and without the aid of the AMS are established. The effectiveness of the AMS in enhancing communication quality is well demonstrated in complex electromagnetic environments, highlighting its beneficial application potential in future wireless systems.
   Submitted 13 November, 2024; originally announced November 2024.

5. arXiv:2410.21440 [pdf, other]
   Subjects: eess.SY (Systems and Control)
   Title: Double Y-Configuration Multi Active Bridge Converter: A Single Stage Bidirectional AC-DC Converter with Simple Sinusoidal Control
   Authors: Mafu Zhang, Huanghaohe Zou, Saleh Farzamkia, Zibo Chen, Chen Chen, Alex Q. Huang
   Abstract: This paper proposes a double Y-configuration multi active bridge converter (DYAB) capable of single stage bidirectional AC-DC isolated power conversion with a simple sinusoidal phase shift modulation. Compared to other dual active bridge (DAB) based AC-DC converters, the DYAB achieves power factor correction (PFC) with a simpler control method while maintaining nearly full-range zero-voltage switching (ZVS) and relatively low circulating current, resulting in high efficiency. The paper details the equivalent model and modulation method. A steady-state analysis tool based on the fast Fourier transform (FFT) is developed to calculate the ZVS range, PFC performance, and loss breakdown. A 2.66 kW hardware test demonstrating an efficiency of 97.14% is implemented to verify the proposed circuit and the steady-state model.
   Submitted 28 October, 2024; originally announced October 2024.

6. arXiv:2410.21276 [pdf, other]
   Subjects: cs.CL (Computation and Language); cs.AI (Artificial Intelligence); cs.CV (Computer Vision and Pattern Recognition); cs.CY (Computers and Society); cs.LG (Machine Learning); cs.SD (Sound); eess.AS (Audio and Speech Processing)
   Title: GPT-4o System Card
   Authors: OpenAI: Aaron Hurst, Adam Lerer, Adam P. Goucher, Adam Perelman, Aditya Ramesh, Aidan Clark, AJ Ostrow, Akila Welihinda, Alan Hayes, Alec Radford, Aleksander Mądry, Alex Baker-Whitcomb, Alex Beutel, Alex Borzunov, Alex Carney, Alex Chow, Alex Kirillov, Alex Nichol, Alex Paino, Alex Renzin, Alex Tachard Passos, Alexander Kirillov, Alexi Christakis, et al. (395 additional authors not shown)
   Abstract: GPT-4o is an autoregressive omni model that accepts as input any combination of text, audio, image, and video, and generates any combination of text, audio, and image outputs. It's trained end-to-end across text, vision, and audio, meaning all inputs and outputs are processed by the same neural network. GPT-4o can respond to audio inputs in as little as 232 milliseconds, with an average of 320 milliseconds, which is similar to human response time in conversation. It matches GPT-4 Turbo performance on text in English and code, with significant improvement on text in non-English languages, while also being much faster and 50% cheaper in the API. GPT-4o is especially better at vision and audio understanding compared to existing models. In line with our commitment to building AI safely and consistent with our voluntary commitments to the White House, we are sharing the GPT-4o System Card, which includes our Preparedness Framework evaluations. In this System Card, we provide a detailed look at GPT-4o's capabilities, limitations, and safety evaluations across multiple categories, focusing on speech-to-speech while also evaluating text and image capabilities, and measures we've implemented to ensure the model is safe and aligned. We also include third-party assessments on dangerous capabilities, as well as discussion of potential societal impacts of GPT-4o's text and vision capabilities.
   Submitted 25 October, 2024; originally announced October 2024.

7. arXiv:2410.17377 [pdf, other]
   Subjects: eess.IV (Image and Video Processing); cs.CV (Computer Vision and Pattern Recognition)
   Title: PtychoFormer: A Transformer-based Model for Ptychographic Phase Retrieval
   Authors: Ryuma Nakahata, Shehtab Zaman, Mingyuan Zhang, Fake Lu, Kenneth Chiu
   Abstract: Ptychography is a computational method of microscopy that recovers high-resolution transmission images of samples from a series of diffraction patterns. While conventional phase retrieval algorithms can iteratively recover the images, they require oversampled diffraction patterns, incur significant computational costs, and struggle to recover the absolute phase of the sample's transmission function. Deep learning algorithms for ptychography are a promising approach to resolving the limitations of iterative algorithms. We present PtychoFormer, a hierarchical transformer-based model for data-driven single-shot ptychographic phase retrieval. PtychoFormer processes subsets of diffraction patterns, generating local inferences that are seamlessly stitched together to produce a high-quality reconstruction. Our model exhibits tolerance to sparsely scanned diffraction patterns and achieves up to 3600 times faster imaging speed than the extended ptychographic iterative engine (ePIE). We also propose the extended-PtychoFormer (ePF), a hybrid approach that combines the benefits of PtychoFormer with the ePIE. ePF minimizes global phase shifts and significantly enhances reconstruction quality, achieving state-of-the-art phase retrieval in ptychography.
   Submitted 22 October, 2024; originally announced October 2024.
   Comments: 20 pages, 12 figures
   ACM Class: I.2.10; I.5.4

8. arXiv:2410.15614 [pdf, other]
   Subjects: eess.IV (Image and Video Processing); cs.CV (Computer Vision and Pattern Recognition); q-bio.NC (Neurons and Cognition)
   Title: Topology-Aware Exploration of Circle of Willis for CTA and MRA: Segmentation, Detection, and Classification
   Authors: Minghui Zhang, Xin You, Hanxiao Zhang, Yun Gu
   Abstract: The Circle of Willis (CoW) vessels are critical to connecting the major circulations of the brain. The topology of this vascular structure is of clinical significance for evaluating the risk and severity of neuro-vascular diseases. The CoW has two representative angiographic imaging modalities, computed tomography angiography (CTA) and magnetic resonance angiography (MRA). TopCow24 provided a 125-pair CTA-MRA dataset for the analysis of the CoW. To explore both CTA and MRA images in a unified framework and learn the inherent topology of the CoW, we construct a universal dataset via independent intensity preprocessing, followed by joint resampling and normalization. Then, we utilize a topology-aware loss to enhance the topology completeness of the CoW and the discrimination between different classes. A complementary topology-aware refinement is further conducted to enhance connectivity within the same class. Our method was evaluated on all three tasks and both modalities, achieving competitive results. In the final test phase of the TopCow24 Challenge, we achieved second place in the CTA-Seg-Task, third place in the CTA-Box-Task, first place in the CTA-Edg-Task, second place in the MRA-Seg-Task, third place in the MRA-Box-Task, and second place in the MRA-Edg-Task.
   Submitted 20 October, 2024; originally announced October 2024.
   Comments: Participation technical report for TopCoW24 challenge @ MICCAI 2024

9. arXiv:2410.14971 [pdf, other]
   Subjects: cs.AI (Artificial Intelligence); cs.CL (Computation and Language); cs.SD (Sound); eess.AS (Audio and Speech Processing)
   Title: BrainECHO: Semantic Brain Signal Decoding through Vector-Quantized Spectrogram Reconstruction for Whisper-Enhanced Text Generation
   Authors: Jilong Li, Zhenxi Song, Jiaqi Wang, Min Zhang, Zhiguo Zhang
   Abstract: Recent advances in decoding language from brain signals (EEG and MEG) have been significantly driven by pre-trained language models, leading to remarkable progress on publicly available non-invasive EEG/MEG datasets. However, previous works predominantly utilize teacher forcing during text generation, leading to significant performance drops without its use. A fundamental issue is the inability to establish a unified feature space correlating textual data with the corresponding evoked brain signals. Although some recent studies attempt to mitigate this gap using an audio-text pre-trained model, Whisper, which is favored for its signal input modality, they still largely overlook the inherent differences between audio signals and brain signals in directly applying Whisper to decode brain signals. To address these limitations, we propose a new multi-stage strategy for semantic brain signal decoding via vEctor-quantized speCtrogram reconstruction for WHisper-enhanced text generatiOn, termed BrainECHO. Specifically, BrainECHO successively conducts: 1) discrete autoencoding of the audio spectrogram; 2) brain-audio latent space alignment; and 3) semantic text generation via Whisper finetuning. Through this autoencoding-alignment-finetuning process, BrainECHO outperforms state-of-the-art methods under the same data split settings on two widely accepted resources: the EEG dataset (Brennan) and the MEG dataset (GWilliams). The innovation of BrainECHO, coupled with its robustness and superiority at the sentence, session, and subject-independent levels across public datasets, underscores its significance for language-based brain-computer interfaces.
   Submitted 19 October, 2024; originally announced October 2024.

10. arXiv:2410.14965 [pdf, other]
    Subjects: eess.IV (Image and Video Processing); cs.CV (Computer Vision and Pattern Recognition)
    Title: Non-Invasive to Invasive: Enhancing FFA Synthesis from CFP with a Benchmark Dataset and a Novel Network
    Authors: Hongqiu Wang, Zhaohu Xing, Weitong Wu, Yijun Yang, Qingqing Tang, Meixia Zhang, Yanwu Xu, Lei Zhu
    Abstract: Fundus imaging is a pivotal tool in ophthalmology, and different imaging modalities are characterized by their specific advantages. For example, Fundus Fluorescein Angiography (FFA) uniquely provides detailed insights into retinal vascular dynamics and pathology, surpassing Color Fundus Photographs (CFP) in detecting microvascular abnormalities and perfusion status. However, the conventional invasive FFA involves discomfort and risks due to fluorescein dye injection, and it is meaningful but challenging to synthesize FFA images from non-invasive CFP. Previous studies primarily focused on FFA synthesis in a single disease category. In this work, we explore FFA synthesis in multiple diseases by devising a Diffusion-guided generative adversarial network, which introduces an adaptive and dynamic diffusion forward process into the discriminator and adds a category-aware representation enhancer. Moreover, to facilitate this research, we collect the first multi-disease CFP and FFA paired dataset, named the Multi-disease Paired Ocular Synthesis (MPOS) dataset, with four different fundus diseases. Experimental results show that our FFA synthesis network can generate better FFA images compared to state-of-the-art methods. Furthermore, we introduce a paired-modal diagnostic network to validate the effectiveness of synthetic FFA images in the diagnosis of multiple fundus diseases, and the results show that our synthesized FFA images with the real CFP images have higher diagnosis accuracy than that of the compared FFA synthesizing methods. Our research bridges the gap between non-invasive imaging and FFA, thereby offering promising prospects to enhance ophthalmic diagnosis and patient care, with a focus on reducing harm to patients through non-invasive procedures. Our dataset and code will be released to support further research in this field (https://github.com/whq-xxh/FFA-Synthesis).
    Submitted 18 October, 2024; originally announced October 2024.
    Comments: ACMMM 24 MCHM
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ACMMM 24 MCHM</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.08318">arXiv:2410.08318</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.08318">pdf</a>, <a href="https://arxiv.org/ps/2410.08318">ps</a>, <a href="https://arxiv.org/format/2410.08318">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Meta-Learning-Driven Adaptive Codebook Design for Near-Field Communications </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+M">Mianyi Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Cai%2C+Y">Yunlong Cai</a>, <a href="/search/eess?searchtype=author&amp;query=Xu%2C+J">Jiaqi Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Swindlehurst%2C+A+L">A. Lee Swindlehurst</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.08318v1-abstract-short" style="display: inline;"> Extremely large-scale arrays (XL-arrays) and ultra-high frequencies are two key technologies for sixth-generation (6G) networks, offering higher system capacity and expanded bandwidth resources. To effectively combine these technologies, it is necessary to consider the near-field spherical-wave propagation model, rather than the traditional far-field planar-wave model. In this paper, we explore a&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.08318v1-abstract-full').style.display = 'inline'; document.getElementById('2410.08318v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.08318v1-abstract-full" style="display: none;"> Extremely large-scale arrays (XL-arrays) and ultra-high frequencies are two key technologies for sixth-generation (6G) networks, offering higher system capacity and expanded bandwidth resources. To effectively combine these technologies, it is necessary to consider the near-field spherical-wave propagation model, rather than the traditional far-field planar-wave model. In this paper, we explore a near-field communication system comprising a base station (BS) with hybrid analog-digital beamforming and multiple mobile users. Our goal is to maximize the system&#39;s sum-rate by optimizing the near-field codebook design for hybrid precoding. To enable fast adaptation to varying user distributions, we propose a meta-learning-based framework that integrates the model-agnostic meta-learning (MAML) algorithm with a codebook learning network. Specifically, we first design a deep neural network (DNN) to learn the near-field codebook. Then, we combine the MAML algorithm with the DNN to allow rapid adaptation to different channel conditions by leveraging a well-initialized model from the outer network. Simulation results demonstrate that our proposed framework outperforms conventional algorithms, offering improved generalization and better overall performance. 

arXiv:2410.03798 [cs.CL, cs.SD, eess.AS]
Self-Powered LLM Modality Expansion for Large Speech-Text Models
Authors: Tengfei Yu, Xuebo Liu, Zhiyi Hou, Liang Ding, Dacheng Tao, Min Zhang
Abstract: Large language models (LLMs) exhibit remarkable performance across diverse tasks, indicating their potential for expansion into large speech-text models (LSMs) by integrating speech capabilities. Although unified speech-text pre-training and multimodal data instruction-tuning offer considerable benefits, these methods generally entail significant resource demands and tend to overfit specific tasks. This study aims to refine the use of speech datasets for LSM training by addressing the limitations of vanilla instruction tuning. We explore the instruction-following dynamics within LSMs, identifying a critical issue termed speech anchor bias: a tendency for LSMs to over-rely on speech inputs, mistakenly interpreting the entire speech modality as directives, thereby neglecting textual instructions.
To counteract this bias, we introduce a self-powered LSM that leverages augmented automatic speech recognition data generated by the model itself for more effective instruction tuning. Our experiments across a range of speech-based tasks demonstrate that the self-powered LSM mitigates speech anchor bias and improves the fusion of speech and text modalities in LSMs. Data, code and scripts are freely available at https://github.com/ytf-philp/Self-powered-LSM.
Submitted 13 October, 2024; v1 submitted 4 October, 2024; originally announced October 2024.
Comments: Accepted to EMNLP 2024

arXiv:2410.01698 [eess.IV, cs.CV]
COSMIC: Compress Satellite Images Efficiently via Diffusion Compensation
Authors: Ziyuan Zhang, Han Qiu, Maosen Zhang, Jun Liu, Bin Chen, Tianwei Zhang, Hewu Li
Abstract: With the rapidly increasing number of satellites in space and their enhanced capabilities, the amount of earth observation images collected by satellites is exceeding the transmission limits of satellite-to-ground links.
Although existing learned image compression solutions achieve remarkable performance by using a sophisticated encoder to extract fruitful features for compression and a decoder to reconstruct, it is still hard to directly deploy those complex encoders on current satellites' embedded GPUs, with their limited computing capability and power supply, to compress images in orbit. In this paper, we propose COSMIC, a simple yet effective learned compression solution to transmit satellite images. We first design a lightweight encoder (i.e. reducing FLOPs by 2.6~5x) on the satellite to achieve a high image compression ratio and save satellite-to-ground link capacity. Then, for reconstruction on the ground, to deal with the feature extraction ability degradation caused by simplifying the encoder, we propose a diffusion-based model to compensate for image details when decoding. Our insight is that satellite earth observation photos are not just images but in fact multi-modal data with a natural Text-to-Image pairing, since they are collected together with rich sensor data (e.g. coordinates, timestamp, etc.) that can be used as the condition for diffusion generation. Extensive experiments show that COSMIC outperforms state-of-the-art baselines on both perceptual and distortion metrics.
Submitted 9 November, 2024; v1 submitted 2 October, 2024; originally announced October 2024.

arXiv:2409.19688 [cs.LG, cs.AI, eess.SP]
Machine Learning for Raman Spectroscopy-based Cyber-Marine Fish Biochemical Composition Analysis
Authors: Yun Zhou, Gang Chen, Bing Xue, Mengjie Zhang, Jeremy S. Rooney, Kirill Lagutin, Andrew MacKenzie, Keith C. Gordon, Daniel P. Killeen
Abstract: The rapid and accurate detection of biochemical compositions in fish is a crucial real-world task that facilitates optimal utilization and extraction of high-value products in the seafood industry. Raman spectroscopy provides a promising solution for quickly and non-destructively analyzing the biochemical composition of fish by associating Raman spectra with biochemical reference data using machine learning regression models. This paper investigates different regression models to address this task and proposes a new design of Convolutional Neural Networks (CNNs) for jointly predicting water, protein, and lipid yields. To the best of our knowledge, we are the first to conduct a successful study employing CNNs to analyze the biochemical composition of fish based on a very small Raman spectroscopic dataset. Our approach combines a tailored CNN architecture with a comprehensive data preparation procedure, effectively mitigating the challenges posed by extreme data scarcity. The results demonstrate that our CNN can significantly outperform two state-of-the-art CNN models and multiple traditional machine learning models, paving the way for accurate and automated analysis of fish biochemical composition.
Submitted 29 September, 2024; originally announced September 2024.
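
To make the joint-prediction setup concrete, a small 1D CNN that maps a spectrum to the three regression targets can look like the sketch below; this is an illustrative architecture, not the paper's design, and the 901-point spectrum length, channel counts, and layer choices are placeholders.

    import torch
    import torch.nn as nn

    class SpectraCNN(nn.Module):
        """Small 1D CNN mapping a Raman spectrum to (water, protein, lipid) yields."""
        def __init__(self, n_channels=16, n_outputs=3):
            super().__init__()
            self.features = nn.Sequential(
                nn.Conv1d(1, n_channels, kernel_size=7, padding=3), nn.ReLU(),
                nn.MaxPool1d(2),
                nn.Conv1d(n_channels, n_channels, kernel_size=5, padding=2), nn.ReLU(),
                nn.AdaptiveAvgPool1d(8),
            )
            self.head = nn.Linear(n_channels * 8, n_outputs)

        def forward(self, x):            # x: (batch, 1, n_wavenumbers)
            z = self.features(x)
            return self.head(z.flatten(1))

    # Toy usage on random "spectra"; 901 points per spectrum is an arbitrary choice.
    model = SpectraCNN()
    x = torch.randn(4, 1, 901)
    y_pred = model(x)                    # (4, 3): jointly predicted water/protein/lipid
    loss = nn.MSELoss()(y_pred, torch.rand(4, 3))
    loss.backward()
    print(y_pred.shape, float(loss))

Joint prediction of the three yields through a shared feature extractor is one natural way to make the most of a very small dataset, since the targets share spectral structure.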

arXiv:2409.19185 [eess.IV, cs.AI, cs.CV]
Semi-Supervised Bone Marrow Lesion Detection from Knee MRI Segmentation Using Mask Inpainting Models
Authors: Shihua Qin, Ming Zhang, Juan Shan, Taehoon Shin, Jonghye Woo, Fangxu Xing
Abstract: Bone marrow lesions (BMLs) are critical indicators of knee osteoarthritis (OA). Since they often appear as small, irregular structures with indistinguishable edges in knee magnetic resonance images (MRIs), effective detection of BMLs in MRI is vital for OA diagnosis and treatment.
This paper proposes a semi-supervised local anomaly detection method using mask inpainting models for identification of BMLs in high-resolution knee MRI, effectively integrating a 3D femur bone segmentation model, a large mask inpainting model, and a series of post-processing techniques. The method was evaluated using MRIs at various resolutions from a subset of the public Osteoarthritis Initiative database. Dice score, Intersection over Union (IoU), and pixel-level sensitivity, specificity, and accuracy showed an advantage over the multiresolution knowledge distillation method, a state-of-the-art global anomaly detection method. In particular, segmentation performance is enhanced on higher-resolution images, achieving more than a two-fold increase in both the Dice score and the IoU score at a 448x448 resolution level. We also demonstrate that, with increasing size of the BML region, both the Dice and IoU scores improve as the proportion of distinguishable boundary decreases. The identified BML masks can serve as markers for downstream tasks such as segmentation and classification. The proposed method has shown potential in improving BML detection, laying a foundation for further advances in imaging-based OA research.
Submitted 27 September, 2024; originally announced September 2024.
Comments: 5 pages, 3 figures, submitted to SPIE Conference on Image Processing
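
For reference, the two overlap metrics quoted above are computed from mask intersection and union; a minimal generic implementation (not the authors' evaluation code) is:

    import numpy as np

    def dice_and_iou(pred, target, eps=1e-8):
        """Dice score and IoU for binary masks given as arrays of 0/1 or booleans."""
        pred = pred.astype(bool)
        target = target.astype(bool)
        inter = np.logical_and(pred, target).sum()
        dice = (2.0 * inter + eps) / (pred.sum() + target.sum() + eps)
        iou = (inter + eps) / (np.logical_or(pred, target).sum() + eps)
        return dice, iou

    # Toy 448x448 masks, mirroring the resolution mentioned in the abstract.
    rng = np.random.default_rng(0)
    pred = rng.random((448, 448)) > 0.95
    target = rng.random((448, 448)) > 0.95
    print(dice_and_iou(pred, target))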

arXiv:2409.18304 [eess.SY]
Multi-platoon car-following models with flexible platoon sizes and communication levels
Authors: Shouwei Hui, Michael Zhang
Abstract: In this paper, we extend a single platoon car-following (CF) model to some multi-platoon CF models for connected and autonomous vehicles (CAVs) with flexible platoon size and communication level. Specifically, we consider forward and backward communication methods between platoons with delays. Some general results of linear stability are mathematically proven, and numerical simulations are performed to illustrate the effects of platoon sizes and communication levels, as well as to demonstrate the potential for stabilizing human-driven vehicles (HDVs) in mixed traffic conditions. The simulation results are consistent with the theoretical analysis and demonstrate that, in the ring road scenario, CAV platoons can stabilize a certain percentage of HDVs. This paper can provide suggestions for the design of communication systems for autonomous vehicles (AVs) and the management of mixed traffic flow of CAVs and HDVs.
Submitted 26 September, 2024; originally announced September 2024.
Comments: Preprint for IEEE
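
The ring-road setting used in such simulations is easy to reproduce in miniature. The sketch below integrates a generic optimal-velocity car-following model on a ring (a textbook single-vehicle-type model, not the paper's multi-platoon CAV model; all parameters are arbitrary) and reports how far the speeds are from a uniform equilibrium at the end of the run.

    import numpy as np

    def simulate_ring(n_veh=20, length=200.0, t_end=100.0, dt=0.1,
                      kappa=1.0, v_max=10.0, h_c=5.0):
        """Euler simulation of an optimal-velocity car-following model on a ring road."""
        x = np.linspace(0.0, length, n_veh, endpoint=False)   # positions
        v = np.zeros(n_veh)                                    # speeds
        x[0] += 0.5                                            # small initial perturbation

        def V(h):                                              # optimal velocity function
            return v_max * (np.tanh(h - h_c) + np.tanh(h_c)) / (1.0 + np.tanh(h_c))

        for _ in range(int(t_end / dt)):
            headway = (np.roll(x, -1) - x) % length            # distance to the leader
            a = kappa * (V(headway) - v)                       # relax toward the optimal speed
            v = np.maximum(v + a * dt, 0.0)
            x = (x + v * dt) % length
        return np.std(v)                                       # speed spread at t_end

    print("speed standard deviation after 100 s:", simulate_ring())

A small final speed spread indicates the perturbation has been absorbed; growing spread indicates string instability, which is the quantity the platoon-size and communication-level comparisons in the paper are concerned with.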

arXiv:2409.13262 [cs.CL, cs.SD, eess.AS]
Large Language Model Should Understand Pinyin for Chinese ASR Error Correction
Authors: Yuang Li, Xiaosong Qiao, Xiaofeng Zhao, Huan Zhao, Wei Tang, Min Zhang, Hao Yang
Abstract: Large language models can enhance automatic speech recognition systems through generative error correction. In this paper, we propose Pinyin-enhanced GEC (PY-GEC), which leverages Pinyin, the phonetic representation of Mandarin Chinese, as supplementary information to improve Chinese ASR error correction. Our approach only utilizes synthetic errors for training and employs the one-best hypothesis during inference. Additionally, we introduce a multitask training approach involving conversion tasks between Pinyin and text to align their feature spaces. Experiments on the Aishell-1 and the Common Voice datasets demonstrate that our approach consistently outperforms GEC with text-only input. More importantly, we provide intuitive explanations for the effectiveness of PY-GEC and multitask training from two aspects: 1) increased attention weight on Pinyin features; and 2) aligned feature space between Pinyin and text hidden states.
Submitted 20 September, 2024; originally announced September 2024.
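
To illustrate what "Pinyin as supplementary information" can look like on the input side, the sketch below attaches a Pinyin transcription to a one-best hypothesis before it is handed to a correction model; it assumes the third-party pypinyin package and a hypothetical prompt layout, and is not the authors' pipeline.

    # Requires the third-party pypinyin package (pip install pypinyin).
    from pypinyin import lazy_pinyin

    def with_pinyin(hypothesis: str) -> str:
        """Attach a Pinyin transcription to a one-best ASR hypothesis so a
        correction model sees both the characters and their pronunciation."""
        pinyin = " ".join(lazy_pinyin(hypothesis))
        return f"ASR hypothesis: {hypothesis}\nPinyin: {pinyin}\nCorrected transcript:"

    print(with_pinyin("今天天气怎么样"))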

arXiv:2409.10890 [eess.IV, cs.CV]
SkinMamba: A Precision Skin Lesion Segmentation Architecture with Cross-Scale Global State Modeling and Frequency Boundary Guidance
Authors: Shun Zou, Mingya Zhang, Bingjian Fan, Zhengyi Zhou, Xiuguo Zou
Abstract: Skin lesion segmentation is a crucial method for identifying early skin cancer. In recent years, both convolutional neural network (CNN) and Transformer-based methods have been widely applied. Moreover, combining CNN and Transformer effectively integrates global and local relationships, but remains limited by the quadratic complexity of Transformer. To address this, we propose a hybrid architecture based on Mamba and CNN, called SkinMamba. It maintains linear complexity while offering powerful long-range dependency modeling and local feature extraction capabilities. Specifically, we introduce the Scale Residual State Space Block (SRSSB), which captures global contextual relationships and cross-scale information exchange at a macro level, enabling expert communication in a global state.
This effectively addresses challenges in skin lesion segmentation related to varying lesion sizes and inconspicuous target areas. Additionally, to mitigate boundary blurring and information loss during model downsampling, we introduce the Frequency Boundary Guided Module (FBGM), providing sufficient boundary priors to guide precise boundary segmentation, while also using the retained information to assist the decoder in the decoding process. Finally, we conducted comparative and ablation experiments on two public lesion segmentation datasets (ISIC2017 and ISIC2018), and the results demonstrate the strong competitiveness of SkinMamba in skin lesion segmentation tasks. The code is available at https://github.com/zs1314/SkinMamba.
Submitted 17 September, 2024; originally announced September 2024.
Comments: Submitted to ACCV2024 workshop

arXiv:2409.08597 [cs.SD, cs.CL, eess.AS]
LA-RAG: Enhancing LLM-based ASR Accuracy with Retrieval-Augmented Generation
Authors: Shaojun Li, Hengchao Shang, Daimeng Wei, Jiaxin Guo, Zongyao Li, Xianghui He, Min Zhang, Hao Yang
Abstract: Recent advancements in integrating speech information into large language models (LLMs) have significantly improved automatic speech recognition (ASR) accuracy. However, existing methods are often constrained by the capabilities of the speech encoders under varied acoustic conditions, such as accents.
To address this, we propose LA-RAG, a novel Retrieval-Augmented Generation (RAG) paradigm for LLM-based ASR. LA-RAG leverages fine-grained token-level speech datastores and a speech-to-speech retrieval mechanism to enhance ASR accuracy via LLM in-context learning (ICL) capabilities. Experiments on Mandarin and various Chinese dialect datasets demonstrate significant improvements in ASR accuracy compared to existing methods, validating the effectiveness of our approach, especially in handling accent variations.
Submitted 13 September, 2024; originally announced September 2024.
Comments: Submitted to ICASSP 2025

arXiv:2409.05004 [cs.SD, eess.AS]
Disentangling the Prosody and Semantic Information with Pre-trained Model for In-Context Learning based Zero-Shot Voice Conversion
Authors: Zhengyang Chen, Shuai Wang, Mingyang Zhang, Xuechen Liu, Junichi Yamagishi, Yanmin Qian
Abstract: Voice conversion (VC) aims to modify the speaker's timbre while retaining speech content.
Previous approaches have tokenized the outputs from self-supervised models into semantic tokens, facilitating disentanglement of speech content information. Recently, in-context learning (ICL) has emerged in text-to-speech (TTS) systems for effectively modeling specific characteristics such as timbre through context conditioning. This paper proposes an ICL capability enhanced VC system (ICL-VC) employing a mask and reconstruction training strategy based on flow-matching generative models. Augmented with semantic tokens, our experiments on the LibriTTS dataset demonstrate that ICL-VC improves speaker similarity. Additionally, we find that k-means is a versatile tokenization method applicable to various pre-trained models. However, the ICL-VC system faces challenges in preserving the prosody of the source speech. To mitigate this issue, we propose incorporating prosody embeddings extracted from a pre-trained emotion recognition model into our system. Integration of prosody embeddings notably enhances the system's capability to preserve source speech prosody, as validated on the Emotional Speech Database.
Submitted 10 September, 2024; v1 submitted 8 September, 2024; originally announced September 2024.

arXiv:2409.03878 [cs.CV, eess.SP, physics.geo-ph]
Ground-roll Separation From Land Seismic Records Based on Convolutional Neural Network
Authors: Zhuang Jia, Wenkai Lu, Meng Zhang, Yongkang Miao
Abstract: Ground-roll wave is a common coherent noise in land field seismic data. This Rayleigh-type surface wave usually has low frequency, low apparent velocity, and high amplitude, and therefore obscures the reflection events of seismic shot gathers. Commonly used techniques focus on the differences between ground-roll and reflection in a transformed domain such as the $f-k$ domain, wavelet domain, or curvelet domain. These approaches use a series of fixed atoms or bases to transform the data from the time-space domain into the transformed domain to separate different waveforms, and thus tend to suffer from the complexity of delicately designing the parameters of the transform-domain filter. To deal with these problems, a novel approach is proposed that separates ground-roll from reflections using a convolutional neural network (CNN) based method, which learns to extract the features of ground-roll and reflections automatically from training data. In the proposed method, low-pass filtered seismic data contaminated by ground-roll is used as the input of the CNN, which then outputs both the ground-roll component and the low-frequency part of the reflection component simultaneously. A discriminative loss is applied together with a similarity loss in the training process to enhance the similarity of the outputs to their training labels as well as the difference between the two outputs. Experiments are conducted on both synthetic and real data, showing that the CNN-based method can separate ground-roll from reflections effectively and has a certain degree of generalization ability.
Submitted 5 September, 2024; originally announced September 2024.
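
The combined objective described above (pull each output toward its label, push the two outputs apart) admits a simple formulation; the sketch below is one plausible reading of it, not the authors' exact loss, and the margin and weighting are placeholders.

    import torch
    import torch.nn.functional as F

    def separation_loss(pred_gr, pred_refl, label_gr, label_refl, margin=1.0, alpha=0.1):
        """Similarity term pulls each output toward its label; a hinge-style
        discriminative term penalizes the two outputs being closer than a margin."""
        similarity = F.mse_loss(pred_gr, label_gr) + F.mse_loss(pred_refl, label_refl)
        discriminative = F.relu(margin - F.mse_loss(pred_gr, pred_refl))
        return similarity + alpha * discriminative

    # Toy shot gathers: (batch, 1, time samples, traces)
    pred_gr, pred_refl = torch.randn(2, 1, 256, 64), torch.randn(2, 1, 256, 64)
    label_gr, label_refl = torch.randn(2, 1, 256, 64), torch.randn(2, 1, 256, 64)
    print(float(separation_loss(pred_gr, pred_refl, label_gr, label_refl)))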

arXiv:2409.00114 [eess.SP, physics.app-ph]
Terahertz Channels in Atmospheric Conditions: Propagation Characteristics and Security Performance
Authors: Jianjun Ma, Yuheng Song, Mingxia Zhang, Guohao Liu, Weiming Li, John F. Federici, Daniel M. Mittleman
Abstract: With the growing demand for higher wireless data rates, the interest in extending the carrier frequency of wireless links to the terahertz (THz) range has significantly increased. For long-distance outdoor wireless communications, THz channels may suffer substantial power loss and security issues due to atmospheric weather effects. It is crucial to assess the impact of weather on high-capacity data transmission to evaluate wireless system link budgets and performance accurately.
In this article, we provide insight into the propagation characteristics of THz channels under atmospheric conditions and the security aspects of THz communication systems in future applications. We conduct a comprehensive survey of our recent research and experimental findings on THz channel transmission and physical layer security, synthesizing and categorizing the state-of-the-art research in this domain. Our analysis encompasses various atmospheric phenomena, including molecular absorption, scattering effects, and turbulence, elucidating their intricate interactions with THz waves and the resultant implications for channel modeling and system design. Furthermore, we investigate the unique security challenges posed by THz communications, examining potential vulnerabilities and proposing novel countermeasures to enhance the resilience of these high-frequency systems against eavesdropping and other security threats. Finally, we discuss the challenges and limitations of such high-frequency wireless communications and provide insights into future research prospects for realizing the 6G vision, emphasizing the need for innovative solutions to overcome the atmospheric hurdles and security concerns in THz communications.
Submitted 17 September, 2024; v1 submitted 27 August, 2024; originally announced September 2024.
Comments: Submitted to Fundamental Research

arXiv:2409.00066 [eess.SP]
Optical Semantic Communication through Multimode Fiber: From Symbol Transmission to Sentiment Analysis
Authors: Zheng Gao, Ting Jiang, Mingming Zhang, Hao Wu, Ming Tang
Abstract: We propose and validate a novel optical semantic transmission scheme using multimode fiber (MMF). By leveraging the frequency sensitivity of intermodal dispersion in MMFs, we achieve high-dimensional semantic encoding and decoding in the frequency domain.
Our system maps symbols to 128 distinct frequencies spaced at 600 kHz intervals, demonstrating a seven-fold increase in capacity compared to conventional communication encoding. We further enhance spectral efficiency by implementing 4-level pulse amplitude modulation (PAM-4), achieving 9.12 bits/s/Hz without decoding errors. Additionally, we explore the application of this system for sentiment analysis using the IMDb movie review dataset. By encoding semantically similar symbols to adjacent frequencies, the system's noise tolerance is effectively improved, facilitating accurate sentiment analysis. This work highlights the potential of MMF-based semantic communication to enhance both capacity and robustness in optical communication systems, offering promising applications in bandwidth-constrained and noisy environments.
Submitted 23 August, 2024; originally announced September 2024.
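
The seven-fold figure follows from the size of the symbol alphabet: selecting one of $128$ frequencies carries $\log_2 128 = 7$ bits per symbol, presumably relative to a binary (one bit per symbol) baseline, and PAM-4 on each detected tone contributes a further $\log_2 4 = 2$ bits. The reported $9.12$ bits/s/Hz additionally depends on the symbol rate and occupied bandwidth, which the abstract does not give.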

arXiv:2408.11289 [eess.IV, cs.CV]
HMT-UNet: A hybird Mamba-Transformer Vision UNet for Medical Image Segmentation
Authors: Mingya Zhang, Zhihao Chen, Yiyuan Ge, Xianping Tao
Abstract: In the field of medical image segmentation, models based on both CNN and Transformer have been thoroughly investigated. However, CNNs have limited modeling capabilities for long-range dependencies, making it challenging to exploit the semantic information within images fully. On the other hand, the quadratic computational complexity poses a challenge for Transformers. State Space Models (SSMs), such as Mamba, have been recognized as a promising method. They not only demonstrate superior performance in modeling long-range interactions, but also preserve a linear computational complexity. The hybrid mechanism of SSM (State Space Model) and Transformer, after meticulous design, can enhance its capability for efficient modeling of visual features. Extensive experiments have demonstrated that integrating the self-attention mechanism into the hybrid part behind the layers of Mamba's architecture can greatly improve the modeling capacity to capture long-range spatial dependencies. In this paper, leveraging the hybrid mechanism of SSM, we propose a U-shape architecture model for medical image segmentation, named Hybird Transformer vision Mamba UNet (HTM-UNet). We conduct comprehensive experiments on the ISIC17, ISIC18, CVC-300, CVC-ClinicDB, Kvasir, CVC-ColonDB, ETIS-Larib PolypDB public datasets and ZD-LCI-GIM private dataset. The results indicate that HTM-UNet exhibits competitive performance in medical image segmentation tasks. Our code is available at https://github.com/simzhangbest/HMT-Unet.
Submitted 6 September, 2024; v1 submitted 20 August, 2024; originally announced August 2024.
Comments: arXiv admin note: text overlap with arXiv:2403.09157; text overlap with arXiv:2407.08083 by other authors

arXiv:2408.03651 [eess.IV, cs.CV]
Path-SAM2: Transfer SAM2 for digital pathology semantic segmentation
Authors: Mingya Zhang, Liang Wang, Zhihao Chen, Yiyuan Ge, Xianping Tao
Abstract: The semantic segmentation task in pathology plays an indispensable role in assisting physicians in determining the condition of tissue lesions. With the proposal of the Segment Anything Model (SAM), more and more foundation models have seen rapid development in the field of image segmentation. Recently, SAM2 has garnered widespread attention in both natural image and medical image segmentation. Compared to SAM, it has significantly improved in terms of segmentation accuracy and generalization performance. We compared the foundational models based on SAM and found that their performance in semantic segmentation of pathological images was hardly satisfactory. In this paper, we propose Path-SAM2, which for the first time adapts the SAM2 model to cater to the task of pathological semantic segmentation. We integrate the largest pretrained vision encoder for histopathology (UNI) with the original SAM2 encoder, adding more pathology-based prior knowledge. Additionally, we introduce a learnable Kolmogorov-Arnold Networks (KAN) classification module to replace the manual prompt process.
In three adenoma pathological datasets, Path-SAM2 has achieved state-of-the-art performance. This study demonstrates the great potential of adapting SAM2 to pathology image segmentation tasks. We plan to release the code and model weights for this paper at: https://github.com/simzhangbest/SAM2PATH
Submitted 4 September, 2024; v1 submitted 7 August, 2024; originally announced August 2024.
Comments: 5 pages, 5 figures

arXiv:2407.20554 [pdf, other] (math.AP, eess.SY)
An anisotropic traffic flow model with look-ahead effect for mixed autonomy traffic
Authors: Shouwei Hui, Michael Zhang
Abstract: In this paper we extend the Aw-Rascle-Zhang (ARZ) non-equilibrium traffic flow model to take into account the look-ahead capability of connected and autonomous vehicles (CAVs), and the mixed flow dynamics of human-driven and autonomous vehicles. The look-ahead effect of CAVs is captured by a non-local averaged density within a certain distance (the look-ahead distance). We show, using wave perturbation analysis, that increased look-ahead distance loosens the stability criteria.
Our numerical experiments, however, showed that a longer look-ahead distance does not necessarily lead to faster convergence to equilibrium states. We also examined the impact of spatial distributions and market penetrations of CAVs, and showed that increased market penetration helps stabilize mixed traffic, while the spatial distribution of CAVs has less effect on stability. The results reveal the potential of using CAVs to stabilize traffic, and may provide qualitative insights on speed control in the mixed autonomy environment.
Submitted 30 July, 2024; originally announced July 2024.
Comments: Submitted to TRB Annual Meeting 2025
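For readers unfamiliar with non-local traffic models, the look-ahead term described above can be pictured as a weighted average of the density over the stretch of road ahead of a vehicle. A generic sketch of such a term (illustrative only; the paper's exact kernel and formulation may differ) is:

```latex
% Generic non-local look-ahead density (illustrative, not necessarily the paper's form).
% \rho(x,t): local density, L: look-ahead distance, w: normalized averaging kernel on [0, L].
\bar{\rho}(x,t) \;=\; \int_{0}^{L} w(s)\, \rho(x+s,\,t)\, \mathrm{d}s,
\qquad \int_{0}^{L} w(s)\, \mathrm{d}s \;=\; 1 .
```

The CAV dynamics then respond to the averaged density \(\bar{\rho}\) rather than the purely local \(\rho\), which is what produces the stabilizing effect studied in the paper.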
arXiv:2407.18324 [pdf, other] (cs.LG, cs.CL, eess.AS, q-fin.CP, q-fin.ST)
AMA-LSTM: Pioneering Robust and Fair Financial Audio Analysis for Stock Volatility Prediction
Authors: Shengkun Wang, Taoran Ji, Jianfeng He, Mariam Almutairi, Dan Wang, Linhan Wang, Min Zhang, Chang-Tien Lu
Abstract: Stock volatility prediction is an important task in the financial industry. Recent advancements in multimodal methodologies, which integrate both textual and auditory data, have demonstrated significant improvements in this domain, such as earnings calls (earnings calls are publicly available and often involve the management team of a public company and interested parties discussing the company's earnings). However, these multimodal methods have faced two drawbacks. First, they often fail to yield reliable models and overfit the data due to their absorption of stochastic information from the stock market. Moreover, using multimodal models to predict stock volatility suffers from gender bias and lacks an efficient way to eliminate such bias. To address these problems, we use adversarial training to generate perturbations that simulate the inherent stochasticity and bias, by creating areas resistant to random information around the input space to improve model robustness and fairness. Our comprehensive experiments on two real-world financial audio datasets reveal that this method exceeds the performance of the current state-of-the-art solution. This confirms the value of adversarial training in reducing stochasticity and bias for stock volatility prediction tasks.
Submitted 3 July, 2024; originally announced July 2024.
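As a generic illustration of the adversarial-training idea referred to above (input perturbations that the model is trained to resist), a minimal FGSM-style step might look like the following. This is a sketch with made-up tensors and a toy regressor; it is not the AMA-LSTM implementation.

```python
import torch
import torch.nn as nn

# Toy regressor standing in for a multimodal volatility model (hypothetical).
model = nn.Sequential(nn.Linear(16, 32), nn.ReLU(), nn.Linear(32, 1))
loss_fn = nn.MSELoss()
opt = torch.optim.Adam(model.parameters(), lr=1e-3)

x = torch.randn(8, 16)   # stand-in for fused text/audio features
y = torch.randn(8, 1)    # stand-in volatility targets
eps = 0.01               # perturbation budget

# 1) Gradient of the loss w.r.t. the inputs.
x_adv = x.clone().requires_grad_(True)
loss_fn(model(x_adv), y).backward()

# 2) FGSM-style perturbation: nudge inputs in the direction that increases the loss.
x_pert = (x + eps * x_adv.grad.sign()).detach()

# 3) Train on clean + perturbed inputs so predictions stay stable in a neighborhood
#    of each input (the first backward also touched the weights, so clear grads).
opt.zero_grad()
total = loss_fn(model(x), y) + loss_fn(model(x_pert), y)
total.backward()
opt.step()
```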
arXiv:2407.11219 [pdf, other] (cs.CV, eess.IV)
TLRN: Temporal Latent Residual Networks For Large Deformation Image Registration
Authors: Nian Wu, Jiarui Xing, Miaomiao Zhang
Abstract: This paper presents a novel approach, termed Temporal Latent Residual Network (TLRN), to predict a sequence of deformation fields in time-series image registration. The challenge of registering time-series images often lies in the occurrence of large motions, especially when images differ significantly from a reference (e.g., the start of a cardiac cycle compared to the peak stretching phase). To achieve accurate and robust registration results, we leverage the nature of motion continuity and exploit the temporal smoothness in consecutive image frames. Our proposed TLRN highlights a temporal residual network with residual blocks carefully designed in latent deformation spaces, which are parameterized by time-sequential initial velocity fields. We treat a sequence of residual blocks over time as a dynamic training system, where each block is designed to learn the residual function between the desired deformation features and the current input accumulated from previous time frames. We validate the effectiveness of TLRN on both synthetic data and real-world cine cardiac magnetic resonance (CMR) image videos. Our experimental results show that TLRN is able to achieve substantially improved registration accuracy compared to the state-of-the-art. Our code is publicly available at https://github.com/nellie689/TLRN.
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.11219v2-abstract-full').style.display = 'none'; document.getElementById('2407.11219v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 15 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages. Accepted by MICCAI 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.08555">arXiv:2407.08555</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.08555">pdf</a>, <a href="https://arxiv.org/format/2407.08555">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> SLoRD: Structural Low-Rank Descriptors for Shape Consistency in Vertebrae Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=You%2C+X">Xin You</a>, <a href="/search/eess?searchtype=author&amp;query=Lou%2C+Y">Yixin Lou</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+M">Minghui Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Yang%2C+J">Jie Yang</a>, <a href="/search/eess?searchtype=author&amp;query=Navab%2C+N">Nassir Navab</a>, <a href="/search/eess?searchtype=author&amp;query=Gu%2C+Y">Yun Gu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.08555v2-abstract-short" style="display: inline;"> Automatic and precise multi-class vertebrae segmentation from CT images is crucial for various clinical applications. However, due to a lack of explicit consistency constraints, existing methods especially for single-stage methods, still suffer from the challenge of intra-vertebrae segmentation inconsistency, which refers to multiple label predictions inside a singular vertebra. For multi-stage me&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.08555v2-abstract-full').style.display = 'inline'; document.getElementById('2407.08555v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.08555v2-abstract-full" style="display: none;"> Automatic and precise multi-class vertebrae segmentation from CT images is crucial for various clinical applications. However, due to a lack of explicit consistency constraints, existing methods especially for single-stage methods, still suffer from the challenge of intra-vertebrae segmentation inconsistency, which refers to multiple label predictions inside a singular vertebra. For multi-stage methods, vertebrae detection serving as the first step, tends to be affected by the pathology and metal implants. 
Thus, imprecise detections cause biased patches before segmentation, which then leads to inaccurate contour delineation and inconsistent segmentation. In our work, we intend to label individual and complete binary masks to address that challenge. Specifically, a contour generation network is proposed based on Structural Low-Rank Descriptors for shape consistency, termed SLoRD. For a structural representation of vertebral contours, we adopt the spherical coordinate system and devise the spherical centroid to calculate contour descriptors. Due to the similar appearances of vertebrae, basic contour descriptors can be acquired to restore original contours. SLoRD therefore leverages these contour priors and explicit shape constraints to push regressed contour points close to vertebral surfaces. Quantitative and qualitative evaluations on VerSe 2019 and 2020 demonstrate the superior performance of our framework over other single-stage and multi-stage state-of-the-art (SOTA) methods. Further, SLoRD is a plug-and-play framework that can refine the segmentation inconsistency in coarse predictions from other approaches.
Submitted 19 September, 2024; v1 submitted 11 July, 2024; originally announced July 2024.
Comments: Under review

arXiv:2407.07306 [pdf] (physics.med-ph, eess.SY)
Electrical Impedance Tomography Based Closed-loop Tumor Treating Fields in Dynamic Lung Tumors
Authors: Minmin Wang, Xu Xie, Yuxi Guo, Liying Zhu, Yue Lan, Haitang Yang, Yun Pan, Guangdi Chen, Shaomin Zhang, Maomao Zhang
inline;"> Tumor Treating Fields (TTFields) is a non-invasive anticancer modality that utilizes alternating electric fields to disrupt cancer cell division and growth. While generally well-tolerated with minimal side effects, traditional TTFields therapy for lung tumors faces challenges due to the influence of respiratory motion. We design a novel closed-loop TTFields strategy for lung tumors by incorporatin&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.07306v1-abstract-full').style.display = 'inline'; document.getElementById('2407.07306v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.07306v1-abstract-full" style="display: none;"> Tumor Treating Fields (TTFields) is a non-invasive anticancer modality that utilizes alternating electric fields to disrupt cancer cell division and growth. While generally well-tolerated with minimal side effects, traditional TTFields therapy for lung tumors faces challenges due to the influence of respiratory motion. We design a novel closed-loop TTFields strategy for lung tumors by incorporating electrical impedance tomography (EIT) for real-time respiratory phase monitoring and dynamic parameter adjustments. Furthermore, we conduct theoretical analysis to evaluate the performance of the proposed method using the lung motion model. Compared to conventional TTFields settings, we observed that variations in the electrical conductivity of lung during different respiratory phases led to a decrease in the average electric field intensity within lung tumors, transitioning from end-expiratory (1.08 V/cm) to end-inspiratory (0.87 V/cm) phases. Utilizing our proposed closed-Loop TTFields approach at the same dose setting (2400 mA, consistent with the traditional TTFields setting), we can achieve a higher and consistent average electric field strength at the tumor site (1.30 V/cm) across different respiratory stages. Our proposed closed-loop TTFields method has the potential to improved lung tumor therapy by mitigating the impact of respiratory motion. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.07306v1-abstract-full').style.display = 'none'; document.getElementById('2407.07306v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">7 pages, 5 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.05310">arXiv:2407.05310</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.05310">pdf</a>, <a href="https://arxiv.org/format/2407.05310">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Neural and Evolutionary Computing">cs.NE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Ternary Spike-based Neuromorphic Signal Processing System </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Wang%2C+S">Shuai Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+D">Dehao Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Belatreche%2C+A">Ammar Belatreche</a>, <a href="/search/eess?searchtype=author&amp;query=Xiao%2C+Y">Yichen Xiao</a>, <a href="/search/eess?searchtype=author&amp;query=Qing%2C+H">Hongyu Qing</a>, <a href="/search/eess?searchtype=author&amp;query=We%2C+W">Wenjie We</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+M">Malu Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Yang%2C+Y">Yang Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.05310v1-abstract-short" style="display: inline;"> Deep Neural Networks (DNNs) have been successfully implemented across various signal processing fields, resulting in significant enhancements in performance. However, DNNs generally require substantial computational resources, leading to significant economic costs and posing challenges for their deployment on resource-constrained edge devices. In this study, we take advantage of spiking neural net&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.05310v1-abstract-full').style.display = 'inline'; document.getElementById('2407.05310v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.05310v1-abstract-full" style="display: none;"> Deep Neural Networks (DNNs) have been successfully implemented across various signal processing fields, resulting in significant enhancements in performance. However, DNNs generally require substantial computational resources, leading to significant economic costs and posing challenges for their deployment on resource-constrained edge devices. In this study, we take advantage of spiking neural networks (SNNs) and quantization technologies to develop an energy-efficient and lightweight neuromorphic signal processing system. Our system is characterized by two principal innovations: a threshold-adaptive encoding (TAE) method and a quantized ternary SNN (QT-SNN). 
The TAE method can efficiently encode time-varying analog signals into sparse ternary spike trains, thereby reducing energy and memory demands for signal processing. QT-SNN, compatible with the ternary spike trains from the TAE method, quantizes both membrane potentials and synaptic weights to reduce memory requirements while maintaining performance. Extensive experiments are conducted on two typical signal-processing tasks: speech and electroencephalogram recognition. The results demonstrate that our neuromorphic signal processing system achieves state-of-the-art (SOTA) performance with a 94% reduced memory requirement. Furthermore, through theoretical energy consumption analysis, our system shows 7.5x energy savings compared to other SNN works. The efficiency and efficacy of the proposed system highlight its potential as a promising avenue for energy-efficient signal processing.
Submitted 7 July, 2024; originally announced July 2024.
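To make the general idea of ternary spike encoding concrete, here is a toy encoder that maps an analog signal to {-1, 0, +1} spikes using a threshold adapted to the recent signal scale. It illustrates the concept only; it is not the TAE algorithm from the paper.

```python
import numpy as np

def ternary_encode(signal: np.ndarray, window: int = 16, k: float = 0.5) -> np.ndarray:
    """Toy ternary spike encoder (illustrative, not the paper's TAE method).

    Emits +1 / -1 when the signal's first difference exceeds an adaptive
    threshold (k times the local RMS of recent differences), else 0.
    """
    diffs = np.diff(signal, prepend=signal[0])
    spikes = np.zeros(len(signal), dtype=np.int8)
    for t in range(len(signal)):
        recent = diffs[max(0, t - window):t + 1]
        thresh = k * np.sqrt(np.mean(recent ** 2) + 1e-12)  # adaptive threshold
        if diffs[t] > thresh:
            spikes[t] = 1
        elif diffs[t] < -thresh:
            spikes[t] = -1
    return spikes

t = np.linspace(0, 1, 200)
x = np.sin(2 * np.pi * 5 * t) + 0.05 * np.random.randn(200)
print(ternary_encode(x)[:20])  # sparse ternary spike train for the first samples
```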
arXiv:2406.18088 [pdf, other] (cs.CL, cs.AI, cs.SD, eess.AS) doi: 10.21437/Interspeech.2024-2550
LLM-Driven Multimodal Opinion Expression Identification
Authors: Bonian Jia, Huiyao Chen, Yueheng Sun, Meishan Zhang, Min Zhang
Abstract: Opinion Expression Identification (OEI) is essential in NLP for applications ranging from voice assistants to depression diagnosis. This study extends OEI to encompass multimodal inputs, underlining the significance of auditory cues in delivering emotional subtleties beyond the capabilities of text. We introduce a novel multimodal OEI (MOEI) task, integrating text and speech to mirror real-world scenarios. Utilizing the CMU MOSEI and IEMOCAP datasets, we construct the CI-MOEI dataset. Additionally, Text-to-Speech (TTS) technology is applied to the MPQA dataset to obtain the CIM-OEI dataset. We design a template for the OEI task to take full advantage of the generative power of large language models (LLMs). Advancing further, we propose an LLM-driven method, STOEI, which combines the speech and text modalities to identify opinion expressions. Our experiments demonstrate that MOEI significantly improves performance, while our method outperforms existing methods by 9.20% and obtains SOTA results.
Submitted 29 June, 2024; v1 submitted 26 June, 2024; originally announced June 2024.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages, 3 Figures, Accept by Interspeech 2024</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Proceedings of Interspeech 2024 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.17784">arXiv:2406.17784</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.17784">pdf</a>, <a href="https://arxiv.org/format/2406.17784">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Scalable Near-Field Localization Based on Partitioned Large-Scale Antenna Array </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Yuan%2C+X">Xiaojun Yuan</a>, <a href="/search/eess?searchtype=author&amp;query=Zheng%2C+Y">Yuqing Zheng</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+M">Mingchen Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Teng%2C+B">Boyu Teng</a>, <a href="/search/eess?searchtype=author&amp;query=Jiang%2C+W">Wenjun Jiang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.17784v1-abstract-short" style="display: inline;"> This paper studies a passive localization system, where an extremely large-scale antenna array (ELAA) is deployed at the base station (BS) to locate a user equipment (UE) residing in its near-field (Fresnel) region. We propose a novel algorithm, named array partitioning-based location estimation (APLE), for scalable near-field localization. The APLE algorithm is developed based on the basic assump&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.17784v1-abstract-full').style.display = 'inline'; document.getElementById('2406.17784v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.17784v1-abstract-full" style="display: none;"> This paper studies a passive localization system, where an extremely large-scale antenna array (ELAA) is deployed at the base station (BS) to locate a user equipment (UE) residing in its near-field (Fresnel) region. We propose a novel algorithm, named array partitioning-based location estimation (APLE), for scalable near-field localization. The APLE algorithm is developed based on the basic assumption that, by partitioning the ELAA into multiple subarrays, the UE can be approximated as in the far-field region of each subarray. We establish a Bayeian inference framework based on the geometric constraints between the UE location and the angles of arrivals (AoAs) at different subarrays. Then, the APLE algorithm is designed based on the message-passing principle for the localization of the UE. APLE exhibits linear computational complexity with the number of BS antennas, leading to a significant reduction in complexity compared to existing methods. We further propose an enhanced APLE (E-APLE) algorithm that refines the location estimate obtained from APLE by following the maximum likelihood principle. 
The E-APLE algorithm achieves superior localization accuracy compared to APLE while maintaining linear complexity with the number of BS antennas. Numerical results demonstrate that the proposed APLE and E-APLE algorithms outperform the existing baselines in terms of localization accuracy.
Submitted 13 May, 2024; originally announced June 2024.
Comments: arXiv admin note: text overlap with arXiv:2312.12342
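The geometric constraint exploited by this kind of array-partitioning scheme can be sketched as follows (a generic 2-D illustration, not the exact signal model of the paper): if subarray m is centered at p_m and the UE sits at u, the far-field AoA seen by that subarray satisfies

```latex
% Generic 2-D relation between the UE position u = (u_x, u_y) and the AoA \theta_m
% observed at subarray m centered at p_m = (p_{m,x}, p_{m,y}).
\tan\theta_m \;=\; \frac{u_y - p_{m,y}}{u_x - p_{m,x}}, \qquad m = 1, \dots, M,
```

so AoA estimates from several subarrays jointly constrain the single unknown position u, which is what the message-passing estimator exploits.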
arXiv:2406.16871 [pdf, other] (eess.SY)
Neural network based model predictive control of voltage for a polymer electrolyte fuel cell system with constraints
Authors: Xiufei Li, Miao Yang, Yuanxin Qi, Miao Zhang
Abstract: A fuel cell system must output a steady voltage as a power source in practical use. A neural network (NN) based model predictive control (MPC) approach is developed in this work to regulate the fuel cell output voltage with safety constraints. The developed NN MPC controller stabilizes the polymer electrolyte fuel cell system's output voltage by controlling the hydrogen and air flow rates at the same time. The safety constraints regarding the hydrogen pressure limit and the input change rate limit are considered. The neural network model is built to describe the system voltage and hydrogen pressure behavior. Simulation results show that the NN MPC can control the voltage at the desired value while satisfying the safety constraints under workload disturbance. The performance of the NN MPC is comparable to that of an MPC based on the detailed underlying physical model of the system.
Submitted 24 March, 2024; originally announced June 2024.
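For readers unfamiliar with NN-based MPC, the scheme can be pictured as: a learned surrogate model predicts voltage and pressure over a short horizon, and at each step the controller applies the first input of the admissible sequence whose predicted trajectory best tracks the voltage setpoint. Below is a minimal random-shooting sketch; the surrogate dynamics, limits, and names are hypothetical placeholders, not the controller from the paper.

```python
import numpy as np

rng = np.random.default_rng(0)

def surrogate(state: np.ndarray, u: np.ndarray) -> np.ndarray:
    """Stand-in for the learned NN model: maps (voltage, H2 pressure) and
    inputs (H2 flow, air flow) to the next state. Purely illustrative."""
    A = np.array([[0.9, 0.02], [0.0, 0.95]])
    B = np.array([[0.05, 0.03], [0.10, -0.02]])
    return A @ state + B @ u

def mpc_step(state, u_prev, v_ref, horizon=10, n_samples=256,
             u_lo=0.0, u_hi=1.0, du_max=0.1, p_max=1.5):
    """Random-shooting MPC with input bounds, bounded input increments,
    and a hydrogen-pressure cap treated as a hard constraint."""
    best_u, best_cost = u_prev, np.inf
    for _ in range(n_samples):
        u_seq = np.clip(u_prev + np.cumsum(
            rng.uniform(-du_max, du_max, size=(horizon, 2)), axis=0), u_lo, u_hi)
        x, cost, feasible = state.copy(), 0.0, True
        for u in u_seq:
            x = surrogate(x, u)
            if x[1] > p_max:             # hydrogen pressure constraint violated
                feasible = False
                break
            cost += (x[0] - v_ref) ** 2  # voltage tracking error
        if feasible and cost < best_cost:
            best_cost, best_u = cost, u_seq[0]
    return best_u                        # apply only the first input (receding horizon)

print(mpc_step(np.array([0.6, 1.0]), np.array([0.5, 0.5]), v_ref=0.7))
```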
arXiv:2406.16326 [pdf, other] (eess.AS)
RefXVC: Cross-Lingual Voice Conversion with Enhanced Reference Leveraging
Authors: Mingyang Zhang, Yi Zhou, Yi Ren, Chen Zhang, Xiang Yin, Haizhou Li
Abstract: This paper proposes RefXVC, a method for cross-lingual voice conversion (XVC) that leverages reference information to improve conversion performance. Previous XVC works generally take an average speaker embedding to condition the speaker identity, which does not account for the changing timbre of speech that occurs with different pronunciations. To address this, our method uses both global and local speaker embeddings to capture the timbre changes during speech conversion. Additionally, we observed a connection between timbre and pronunciation in different languages and utilized this by incorporating a timbre encoder and a pronunciation matching network into our model. Furthermore, we found that the variation in tones is not adequately reflected in a single sentence, and therefore we used multiple references to better capture the range of a speaker's voice. The proposed method outperformed existing systems in terms of both speech quality and speaker similarity, highlighting the effectiveness of leveraging reference information in cross-lingual voice conversion. The converted speech samples can be found at: http://refxvc.dn3point.com
Submitted 24 June, 2024; originally announced June 2024.
Comments: Manuscript under review by TASLP

arXiv:2406.14186 [pdf, other] (eess.IV, cs.CV)
CriDiff: Criss-cross Injection Diffusion Framework via Generative Pre-train for Prostate Segmentation
Authors: Tingwei Liu, Miao Zhang, Leiye Liu, Jialong Zhong, Shuyao Wang, Yongri Piao, Huchuan Lu
Abstract: Recently, Diffusion Probabilistic Model (DPM)-based methods have achieved substantial success in the field of medical image segmentation. However, most of these methods fail to enable the diffusion model to learn edge features and non-edge features effectively and to inject them efficiently into the diffusion backbone.
Additionally, the domain gap between the image features and the diffusion-model features poses a great challenge to prostate segmentation. In this paper, we propose CriDiff, a two-stage feature-injecting framework with a Crisscross Injection Strategy (CIS) and a Generative Pre-train (GP) approach for prostate segmentation. The CIS maximizes the use of multi-level features by efficiently harnessing the complementarity of high- and low-level features. To effectively learn multiple levels of edge and non-edge features, we propose two parallel conditioners in the CIS: the Boundary Enhance Conditioner (BEC) and the Core Enhance Conditioner (CEC), which discriminatively model the image edge regions and non-edge regions, respectively. Moreover, the GP approach eases the inconsistency between the image features and the diffusion model without adding additional parameters. Extensive experiments on four benchmark datasets demonstrate the effectiveness of the proposed method, which achieves state-of-the-art performance on four evaluation metrics.
Submitted 20 June, 2024; originally announced June 2024.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted in MICCAI 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.13179">arXiv:2406.13179</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.13179">pdf</a>, <a href="https://arxiv.org/format/2406.13179">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Neural and Evolutionary Computing">cs.NE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Global-Local Convolution with Spiking Neural Networks for Energy-efficient Keyword Spotting </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Wang%2C+S">Shuai Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+D">Dehao Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Shi%2C+K">Kexin Shi</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+Y">Yuchen Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Wei%2C+W">Wenjie Wei</a>, <a href="/search/eess?searchtype=author&amp;query=Wu%2C+J">Jibin Wu</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+M">Malu Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.13179v1-abstract-short" style="display: inline;"> Thanks to Deep Neural Networks (DNNs), the accuracy of Keyword Spotting (KWS) has made substantial progress. However, as KWS systems are usually implemented on edge devices, energy efficiency becomes a critical requirement besides performance. Here, we take advantage of spiking neural networks&#39; energy efficiency and propose an end-to-end lightweight KWS model. The model consists of two innovative&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.13179v1-abstract-full').style.display = 'inline'; document.getElementById('2406.13179v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.13179v1-abstract-full" style="display: none;"> Thanks to Deep Neural Networks (DNNs), the accuracy of Keyword Spotting (KWS) has made substantial progress. However, as KWS systems are usually implemented on edge devices, energy efficiency becomes a critical requirement besides performance. Here, we take advantage of spiking neural networks&#39; energy efficiency and propose an end-to-end lightweight KWS model. The model consists of two innovative modules: 1) Global-Local Spiking Convolution (GLSC) module and 2) Bottleneck-PLIF module. Compared to the hand-crafted feature extraction methods, the GLSC module achieves speech feature extraction that is sparser, more energy-efficient, and yields better performance. The Bottleneck-PLIF module further processes the signals from GLSC with the aim to achieve higher accuracy with fewer parameters. 
Extensive experiments are conducted on the Google Speech Commands Dataset (V1 and V2). The results show that our method achieves competitive performance among SNN-based KWS models with fewer parameters.
Submitted 18 June, 2024; originally announced June 2024.

arXiv:2406.10844 [pdf, other] (eess.AS, cs.SD)
Multi-Scale Accent Modeling with Disentangling for Multi-Speaker Multi-Accent TTS Synthesis
Authors: Xuehao Zhou, Mingyang Zhang, Yi Zhou, Zhizheng Wu, Haizhou Li
Abstract: Synthesizing speech across different accents while preserving the speaker identity is essential for various real-world customer applications. However, the individual and accurate modeling of accents and speakers in a text-to-speech (TTS) system is challenging due to the complexity of accent variations and the intrinsic entanglement between the accent and speaker identity. In this paper, we present a novel approach for multi-speaker multi-accent TTS synthesis, which aims to synthesize voices of multiple speakers, each with various accents. Our proposed approach employs a multi-scale accent modeling strategy to address accent variations at different levels.
Specifically, we introduce both global (utterance-level) and local (phoneme-level) accent modeling, supervised by individual accent classifiers, to capture the overall variation within accented utterances and the fine-grained variations between phonemes, respectively. To control accents and speakers separately, speaker-independent accent modeling is necessary; this is achieved by adversarial training with speaker classifiers to disentangle speaker identity within the multi-scale accent modeling. Consequently, we obtain speaker-independent and accent-discriminative multi-scale embeddings as comprehensive accent features. Additionally, we propose a local accent prediction model that allows accented speech to be generated directly from phoneme inputs. Extensive experiments are conducted on an accented English speech corpus. Both objective and subjective evaluations show the superiority of our proposed system compared to baseline systems. Detailed component analysis demonstrates the effectiveness of global and local accent modeling, and of speaker disentanglement, in multi-speaker multi-accent speech synthesis.
Submitted 16 June, 2024; originally announced June 2024.
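Adversarial speaker disentanglement of the kind described above is commonly implemented with a gradient reversal layer: a speaker classifier is trained on the accent embeddings, while reversed gradients push the accent encoder to discard speaker information. The minimal PyTorch sketch below shows that generic building block only; the sizes and modules are made up and it is not the paper's actual architecture.

```python
import torch
import torch.nn as nn

class GradReverse(torch.autograd.Function):
    """Identity in the forward pass; flips (and scales) gradients in the backward pass."""
    @staticmethod
    def forward(ctx, x, lam):
        ctx.lam = lam
        return x.view_as(x)

    @staticmethod
    def backward(ctx, grad_output):
        return -ctx.lam * grad_output, None

accent_encoder = nn.Sequential(nn.Linear(80, 64), nn.ReLU(), nn.Linear(64, 32))
speaker_classifier = nn.Linear(32, 10)   # 10 hypothetical training speakers

feats = torch.randn(4, 80)               # stand-in acoustic features
speaker_ids = torch.randint(0, 10, (4,))

emb = accent_encoder(feats)
# The classifier still learns to predict the speaker, but the reversed gradient
# trains the encoder to make that prediction hard, removing speaker identity.
spk_logits = speaker_classifier(GradReverse.apply(emb, 1.0))
adv_loss = nn.functional.cross_entropy(spk_logits, speaker_ids)
adv_loss.backward()
```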
arXiv:2406.09317 [eess.IV, cs.CV]
Title: Common and Rare Fundus Diseases Identification Using Vision-Language Foundation Model with Knowledge of Over 400 Diseases
Authors: Meng Wang, Tian Lin, Aidi Lin, Kai Yu, Yuanyuan Peng, Lianyu Wang, Cheng Chen, Ke Zou, Huiyu Liang, Man Chen, Xue Yao, Meiqin Zhang, Binwei Huang, Chaoxin Zheng, Peixin Zhang, Wei Chen, Yilong Luo, Yifan Chen, Honghe Xia, Tingkun Shi, Qi Zhang, Jinming Guo, Xiaolin Chen, Jingcheng Wang, Yih Chung Tham, et al. (24 additional authors not shown)
Abstract: Previous foundation models for retinal images were pre-trained with limited disease categories and a limited knowledge base. Here we introduce RetiZero, a vision-language foundation model that leverages knowledge from over 400 fundus diseases. For RetiZero's pre-training, we compiled 341,896 fundus images paired with text descriptions, sourced from public datasets, ophthalmic literature, and online resources, encompassing a diverse range of diseases across multiple ethnicities and countries. RetiZero exhibits superior performance in several downstream tasks, including zero-shot disease recognition, image-to-image retrieval, and internal- and cross-domain disease identification. In zero-shot scenarios, RetiZero achieves Top-5 accuracy scores of 0.8430 for 15 fundus diseases and 0.7561 for 52 fundus diseases. For image retrieval, it achieves Top-5 scores of 0.9500 and 0.8860 for the same disease sets, respectively. Clinical evaluations show that RetiZero's Top-3 zero-shot performance surpasses the average of 19 ophthalmologists from Singapore, China and the United States. Furthermore, RetiZero significantly enhances clinicians' accuracy in diagnosing fundus disease. These findings underscore the value of integrating the RetiZero foundation model into clinical settings, where a variety of fundus diseases are encountered.
Submitted 30 June, 2024; v1 submitted 13 June, 2024; originally announced June 2024.

arXiv:2406.07330 [cs.CL, cs.AI, cs.SD, eess.AS]
Title: CTC-based Non-autoregressive Textless Speech-to-Speech Translation
Authors: Qingkai Fang, Zhengrui Ma, Yan Zhou, Min Zhang, Yang Feng
Abstract: Direct speech-to-speech translation (S2ST) has achieved impressive translation quality, but it often faces the challenge of slow decoding due to the considerable length of speech sequences. Recently, some research has turned to non-autoregressive (NAR) models to expedite decoding, yet their translation quality typically lags significantly behind that of autoregressive (AR) models. In this paper, we investigate the performance of CTC-based NAR models in S2ST, as these models have shown impressive results in machine translation. Experimental results demonstrate that by combining pretraining, knowledge distillation, and advanced NAR training techniques such as glancing training and non-monotonic latent alignments, CTC-based NAR models achieve translation quality comparable to the AR model, while preserving up to a 26.81× decoding speedup.
Submitted 11 June, 2024; originally announced June 2024.
Comments: ACL 2024 Findings
ACM Class: I.2.7
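CTC-based non-autoregressive decoding, as used above, produces one prediction per input frame and then collapses repeats and blanks. The snippet below sketches only that collapse step, assuming frame-level argmax ids and a blank id of 0; it is illustrative rather than the paper's decoder.

```python
# Illustrative sketch: greedy CTC collapse, the post-processing step a
# CTC-based non-autoregressive decoder applies to its per-frame predictions.
from itertools import groupby

BLANK = 0  # assumed id of the CTC blank token

def ctc_collapse(frame_ids):
    """Merge repeated ids, then drop blanks: [1,1,0,2,2,0,2] -> [1,2,2]."""
    merged = [k for k, _ in groupby(frame_ids)]   # collapse consecutive repeats
    return [k for k in merged if k != BLANK]      # remove blank symbols

if __name__ == "__main__":
    # e.g. per-frame argmax over an acoustic-unit vocabulary
    print(ctc_collapse([7, 7, 0, 0, 3, 3, 0, 3, 5]))  # -> [7, 3, 3, 5]
```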
arXiv:2406.07289 [cs.CL, cs.AI, cs.SD, eess.AS]
Title: Can We Achieve High-quality Direct Speech-to-Speech Translation without Parallel Speech Data?
Authors: Qingkai Fang, Shaolei Zhang, Zhengrui Ma, Min Zhang, Yang Feng
Abstract: Recently proposed two-pass direct speech-to-speech translation (S2ST) models decompose the task into speech-to-text translation (S2TT) and text-to-speech (TTS) within an end-to-end model, yielding promising results. However, the training of these models still relies on parallel speech data, which is extremely challenging to collect. In contrast, S2TT and TTS have accumulated large amounts of data and pretrained models, which have not been fully utilized in the development of S2ST models. Inspired by this, in this paper we first introduce a composite S2ST model named ComSpeech, which can seamlessly integrate any pretrained S2TT and TTS models into a direct S2ST model. Furthermore, to eliminate the reliance on parallel speech data, we propose a novel training method, ComSpeech-ZS, that solely utilizes S2TT and TTS data. It aligns representations in the latent space through contrastive learning, enabling the speech synthesis capability learned from the TTS data to generalize to S2ST in a zero-shot manner. Experimental results on the CVSS dataset show that when parallel speech data is available, ComSpeech surpasses previous two-pass models such as UnitY and Translatotron 2 in both translation quality and decoding speed. When there is no parallel speech data, ComSpeech-ZS lags behind ComSpeech by only 0.7 ASR-BLEU and outperforms the cascaded models.
Submitted 11 June, 2024; originally announced June 2024.
Comments: ACL 2024 main conference. Project Page: https://ictnlp.github.io/ComSpeech-Site/
ACM Class: I.2.7
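The contrastive latent-space alignment described above can be illustrated with a generic InfoNCE-style loss over paired latents from two encoders. The sketch below assumes batch-paired vectors and a temperature of 0.07; it is a generic pattern, not ComSpeech-ZS itself.

```python
# Minimal sketch under stated assumptions: an InfoNCE-style loss that pulls
# paired latent vectors together and pushes mismatched pairs apart, i.e.
# generic contrastive latent-space alignment.
import torch
import torch.nn.functional as F

def contrastive_alignment_loss(z_a, z_b, temperature=0.07):
    """z_a, z_b: (batch, dim) latents assumed to be paired row-by-row."""
    z_a = F.normalize(z_a, dim=-1)
    z_b = F.normalize(z_b, dim=-1)
    logits = z_a @ z_b.t() / temperature      # (batch, batch) similarities
    targets = torch.arange(z_a.size(0))       # positives lie on the diagonal
    # symmetric cross-entropy over both matching directions
    return 0.5 * (F.cross_entropy(logits, targets) +
                  F.cross_entropy(logits.t(), targets))

if __name__ == "__main__":
    a, b = torch.randn(8, 256), torch.randn(8, 256)
    print(contrastive_alignment_loss(a, b).item())
```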
arXiv:2406.06937 [cs.CL, cs.AI, cs.SD, eess.AS]
DOI: 10.18653/v1/2024.acl-long.85
Title: A Non-autoregressive Generation Framework for End-to-End Simultaneous Speech-to-Speech Translation
Authors: Zhengrui Ma, Qingkai Fang, Shaolei Zhang, Shoutao Guo, Yang Feng, Min Zhang
Abstract: Simultaneous translation models play a crucial role in facilitating communication. However, existing research primarily focuses on text-to-text or speech-to-text models, necessitating additional cascade components to achieve speech-to-speech translation. These pipeline methods suffer from error propagation and accumulate delays in each cascade component, resulting in reduced synchronization between the speaker and listener. To overcome these challenges, we propose a novel non-autoregressive generation framework for simultaneous speech translation (NAST-S2X), which integrates speech-to-text and speech-to-speech tasks into a unified end-to-end framework. We develop a non-autoregressive decoder capable of concurrently generating multiple text or acoustic unit tokens upon receiving fixed-length speech chunks. The decoder can generate blank or repeated tokens and employ CTC decoding to dynamically adjust its latency. Experimental results show that NAST-S2X outperforms state-of-the-art models in both speech-to-text and speech-to-speech tasks. It achieves high-quality simultaneous interpretation within a delay of less than 3 seconds and provides a 28 times decoding speedup in offline generation.
Submitted 19 October, 2024; v1 submitted 11 June, 2024; originally announced June 2024.
Comments: ACL 2024; Codes and demos are at https://github.com/ictnlp/NAST-S2x

arXiv:2406.03049 [cs.CL, cs.AI, cs.SD, eess.AS]
Title: StreamSpeech: Simultaneous Speech-to-Speech Translation with Multi-task Learning
Authors: Shaolei Zhang, Qingkai Fang, Shoutao Guo, Zhengrui Ma, Min Zhang, Yang Feng
Abstract: Simultaneous speech-to-speech translation (Simul-S2ST, a.k.a. streaming speech translation) outputs target speech while receiving streaming speech inputs, which is critical for real-time communication. Beyond accomplishing translation between speech, Simul-S2ST requires a policy that controls the model to generate the corresponding target speech at the opportune moment within the speech inputs, thereby posing a double challenge of translation and policy. In this paper, we propose StreamSpeech, a direct Simul-S2ST model that jointly learns translation and simultaneous policy in a unified framework of multi-task learning. Adhering to a multi-task learning approach, StreamSpeech can perform offline and simultaneous speech recognition, speech translation and speech synthesis via an "All-in-One" seamless model. Experiments on the CVSS benchmark demonstrate that StreamSpeech achieves state-of-the-art performance in both offline S2ST and Simul-S2ST tasks. Besides, StreamSpeech is able to present high-quality intermediate results (i.e., ASR or translation results) during the simultaneous translation process, offering a more comprehensive real-time communication experience.
Submitted 5 June, 2024; originally announced June 2024.
Comments: Accepted to ACL 2024 main conference, Project Page: https://ictnlp.github.io/StreamSpeech-site/

arXiv:2405.18791 [eess.SY, math.DS]
Title: A new platooning model for connected and autonomous vehicles to improve string stability
Authors: Shouwei Hui, Michael Zhang
Abstract: This paper presents a novel approach to coordinated vehicle platooning, where the platoon followers communicate solely with the platoon leader. A dynamic model is proposed to account for driving safety under communication delays. General linear stability results are mathematically proven, and numerical simulations are performed to analyze the impact of model parameters in two scenarios: a ring road with initial disturbance and an infinite road with periodic disturbance. The simulation outcomes align with the theoretical analysis, demonstrating that the proposed "look-to-the-leader" platooning strategy significantly outperforms conventional car-following strategies, such as following one or two vehicles ahead, in terms of traffic flow stabilization. This paper introduces a new perspective on organizing platoons for autonomous vehicles, with implications for enhancing traffic stability.
Submitted 10 September, 2024; v1 submitted 29 May, 2024; originally announced May 2024.
Comments: preprint submitted to Physica A

arXiv:2405.17441 [cs.NI, cs.AI, cs.CL, eess.SY]
Title: When Large Language Models Meet Optical Networks: Paving the Way for Automation
Authors: Danshi Wang, Yidi Wang, Xiaotian Jiang, Yao Zhang, Yue Pang, Min Zhang
Abstract: Since the advent of GPT, large language models (LLMs) have brought about revolutionary advancements in all walks of life. As a superior natural language processing (NLP) technology, LLMs have consistently achieved state-of-the-art performance in numerous areas. However, LLMs are considered to be general-purpose models for NLP tasks, which may encounter challenges when applied to complex tasks in specialized fields such as optical networks. In this study, we propose a framework of LLM-empowered optical networks, facilitating intelligent control of the physical layer and efficient interaction with the application layer through an LLM-driven agent (AI-Agent) deployed in the control layer. The AI-Agent can leverage external tools and extract domain knowledge from a comprehensive resource library specifically established for optical networks. This is achieved through user input and well-crafted prompts, enabling the generation of control instructions and result representations for autonomous operation and maintenance in optical networks. To improve the LLM's capability in professional fields and stimulate its potential on complex tasks, the details of performing prompt engineering, establishing a domain knowledge library, and implementing complex tasks are illustrated in this study. Moreover, the proposed framework is verified on two typical tasks: network alarm analysis and network performance optimization. The good response accuracies and semantic similarities on 2,400 test situations exhibit the great potential of LLMs in optical networks.
Submitted 24 June, 2024; v1 submitted 14 May, 2024; originally announced May 2024.

arXiv:2405.04253 [eess.SP]
Title: Fermat Number Transform Based Chromatic Dispersion Compensation and Adaptive Equalization Algorithm
Authors: Siyu Chen, Zheli Liu, Weihao Li, Zihe Hu, Mingming Zhang, Sheng Cui, Ming Tang
Abstract: By introducing the Fermat number transform into chromatic dispersion compensation and adaptive equalization, the computational complexity has been reduced by 68% compared with the conventional implementation. Experimental results validate its transmission performance with only 0.8 dB receiver sensitivity penalty in a 75 km-40 GBaud-PDM-16QAM system.
Submitted 7 May, 2024; originally announced May 2024.
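For reference, a Fermat number transform is a number-theoretic transform over the integers modulo a Fermat prime, here F4 = 2^16 + 1, which enables exact circular convolution of the kind used in frequency-domain dispersion-compensating filters. The sketch below uses a direct O(N^2) form and an illustrative length-8 example; the paper's block sizes and fixed-point details are not reproduced.

```python
# Hedged sketch: a length-N Fermat number transform (NTT modulo the Fermat
# prime F4 = 2^16 + 1) and exact circular convolution via pointwise products.
P = 65537          # Fermat prime F4; its multiplicative group has order 2^16
G = 3              # 3 is a primitive root modulo 65537

def fnt(x, inverse=False):
    n = len(x)                      # n must divide 2^16
    assert (P - 1) % n == 0
    w = pow(G, (P - 1) // n, P)     # n-th root of unity mod P
    if inverse:
        w = pow(w, P - 2, P)        # use w^-1 for the inverse transform
    out = [sum(x[j] * pow(w, i * j, P) for j in range(n)) % P for i in range(n)]
    if inverse:
        n_inv = pow(n, P - 2, P)    # scale by n^-1 mod P
        out = [(v * n_inv) % P for v in out]
    return out

def circular_convolve(a, b):
    """Exact circular convolution via pointwise products in the FNT domain."""
    A, B = fnt(a), fnt(b)
    return fnt([x * y % P for x, y in zip(A, B)], inverse=True)

if __name__ == "__main__":
    a = [1, 2, 3, 4, 0, 0, 0, 0]    # short filter taps, zero-padded to length 8
    b = [5, 6, 7, 0, 0, 0, 0, 0]
    print(circular_convolve(a, b))  # -> [5, 16, 34, 52, 45, 28, 0, 0]
```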
arXiv:2405.00734 [eess.SP, cs.AI, cs.LG]
Title: EEG-MACS: Manifold Attention and Confidence Stratification for EEG-based Cross-Center Brain Disease Diagnosis under Unreliable Annotations
Authors: Zhenxi Song, Ruihan Qin, Huixia Ren, Zhen Liang, Yi Guo, Min Zhang, Zhiguo Zhang
Abstract: Cross-center data heterogeneity and annotation unreliability significantly challenge the intelligent diagnosis of diseases using brain signals. A notable example is the EEG-based diagnosis of neurodegenerative diseases, which features subtler abnormal neural dynamics typically observed in small-group settings. To advance this area, in this work, we introduce a transferable framework employing Manifold Attention and Confidence Stratification (MACS) to diagnose neurodegenerative disorders based on EEG signals sourced from four centers with unreliable annotations. The MACS framework's effectiveness stems from these features: 1) the Augmentor generates various EEG-represented brain variants to enrich the data space; 2) the Switcher enhances the feature space for trusted samples and reduces overfitting on incorrectly labeled samples; 3) the Encoder uses the Riemannian manifold and Euclidean metrics to capture spatiotemporal variations and dynamic synchronization in EEG; 4) the Projector, equipped with dual heads, monitors consistency across multiple brain variants and ensures diagnostic accuracy; 5) the Stratifier adaptively stratifies learned samples by confidence levels throughout the training process; 6) forward and backpropagation in MACS are constrained by confidence stratification to stabilize the learning system amid unreliable annotations. Our subject-independent experiments, conducted on both neurocognitive and movement disorders using cross-center corpora, have demonstrated superior performance compared to existing related algorithms. This work not only improves EEG-based diagnostics for cross-center and small-setting brain diseases but also offers insights into extending MACS techniques to other data analyses, tackling data heterogeneity and annotation unreliability in multimedia and multimodal content understanding.
Submitted 13 August, 2024; v1 submitted 29 April, 2024; originally announced May 2024.

arXiv:2404.18096 [eess.IV, cs.CV]
Title: Snake with Shifted Window: Learning to Adapt Vessel Pattern for OCTA Segmentation
Authors: Xinrun Chen, Mei Shen, Haojian Ning, Mengzhan Zhang, Chengliang Wang, Shiying Li
Abstract: Segmenting specific targets or structures in optical coherence tomography angiography (OCTA) images is fundamental for conducting further pathological studies. The retinal vascular layers are rich and intricate, and such vasculature with complex shapes can be captured by the widely studied OCTA images. In this paper, we thus study how to use OCTA images with projection vascular layers to segment retinal structures. To this end, we propose the SSW-OCTA model, which integrates the advantages of deformable convolutions suited for tubular structures and the swin-transformer for global feature extraction, adapting to the characteristics of OCTA modality images. Our model underwent testing and comparison on the OCTA-500 dataset, achieving state-of-the-art performance. The code is available at: https://github.com/ShellRedia/Snake-SWin-OCTA.
Submitted 28 April, 2024; originally announced April 2024.
arXiv:2404.17280 [cs.SD, eess.AS]
Title: Device Feature based on Graph Fourier Transformation with Logarithmic Processing For Detection of Replay Speech Attacks
Authors: Mingrui He, Longting Xu, Han Wang, Mingjun Zhang, Rohan Kumar Das
Abstract: The most common spoofing attacks on automatic speaker verification systems are replay speech attacks. Detection of replay speech heavily relies on replay configuration information. Previous studies have shown that graph Fourier transform-derived features can effectively detect replay speech but ignore device and environmental noise effects. In this work, we propose a new feature, the graph frequency device cepstral coefficient, derived from the graph frequency domain using a device-related linear transformation. We also introduce two novel representations: the graph frequency logarithmic coefficient and the graph frequency logarithmic device coefficient. We evaluate our methods using traditional Gaussian mixture model and light convolutional neural network systems as classifiers. On the ASVspoof 2017 V2, ASVspoof 2019 physical access, and ASVspoof 2021 physical access datasets, our proposed features outperform known front-ends, demonstrating their effectiveness for replay speech detection.
Submitted 26 April, 2024; originally announced April 2024.
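A graph Fourier transform, the basis of the features above, projects a signal frame onto the eigenvectors of a graph Laplacian. The sketch below uses a simple path graph over the samples and a generic log-energy step; the graph construction and the device-related transformation of the actual front-end are assumptions, not taken from the paper.

```python
# Illustrative sketch: a graph Fourier transform of a short signal frame,
# using a path graph over the samples; the GFT is projection onto the
# Laplacian's eigenvectors.
import numpy as np

def path_graph_laplacian(n):
    """Combinatorial Laplacian L = D - A of a path graph with n nodes."""
    A = np.zeros((n, n))
    idx = np.arange(n - 1)
    A[idx, idx + 1] = A[idx + 1, idx] = 1.0
    return np.diag(A.sum(axis=1)) - A

def graph_fourier_transform(frame):
    L = path_graph_laplacian(len(frame))
    eigvals, eigvecs = np.linalg.eigh(L)       # eigvecs form the GFT basis
    return eigvecs.T @ frame, eigvals          # graph-spectral coefficients

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    frame = rng.standard_normal(64)            # one 64-sample frame (assumed)
    coeffs, freqs = graph_fourier_transform(frame)
    log_energy = np.log(coeffs ** 2 + 1e-12)   # logarithmic processing step
    print(log_energy[:5])
```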
arXiv:2404.08199 [eess.SP]
DOI: 10.1109/TCSII.2023.3266594
Title: Cepstral Analysis Based Artifact Detection, Recognition and Removal for Prefrontal EEG
Authors: Siqi Han, Chao Zhang, Jiaxin Lei, Qingquan Han, Yuhui Du, Anhe Wang, Shuo Bai, Milin Zhang
Abstract: This paper proposes to use the cepstrum for artifact detection, recognition and removal in prefrontal EEG. This work focuses on the artifact caused by eye movement. A database containing artifact-free EEG and eye-movement-contaminated EEG from different subjects is established. A cepstral analysis-based feature extraction with a support vector machine (SVM) based classifier is designed to identify the artifacts from the target EEG signals. The proposed method achieves an accuracy of 99.62% on the artifact detection task and an 82.79% accuracy on the 6-category eye movement classification task. A statistical value-based artifact removal method is proposed and evaluated on a public EEG database, where an accuracy improvement of 3.46% is obtained on the 3-category emotion classification task. In order to make a confident decision on each 5 s EEG segment, the algorithm requires only 0.66M multiplication operations. Compared to the state-of-the-art approaches in artifact detection and removal, the proposed method features higher detection accuracy and lower computational cost, which makes it a more suitable solution to be integrated into a real-time and artifact-robust Brain-Machine Interface (BMI).
Submitted 11 April, 2024; originally announced April 2024.
Comments: 5 pages, 4 figures, published by TCAS-II
Journal ref: IEEE Transactions on Circuits and Systems II: Express Briefs, 2023
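The detection pipeline above combines cepstral features with an SVM classifier. The sketch below shows that generic combination on synthetic stand-in segments (real cepstrum plus scikit-learn's SVC); the database, feature dimensions, and artifact classes are illustrative assumptions, not the paper's setup.

```python
# Minimal sketch under stated assumptions (synthetic data, generic real
# cepstrum): cepstral feature extraction followed by an SVM classifier.
import numpy as np
from sklearn.svm import SVC

def real_cepstrum(segment, n_coeffs=20):
    """Low-quefrency real cepstrum of a 1-D signal segment."""
    spectrum = np.abs(np.fft.rfft(segment)) + 1e-12
    cepstrum = np.fft.irfft(np.log(spectrum))
    return cepstrum[:n_coeffs]

if __name__ == "__main__":
    rng = np.random.default_rng(1)
    # synthetic stand-ins: class 0 ~ "clean", class 1 ~ "artifact" (slow drift)
    clean = rng.standard_normal((100, 640))
    drift = clean + 5 * np.sin(np.linspace(0, 3 * np.pi, 640))
    X = np.vstack([[real_cepstrum(s) for s in clean],
                   [real_cepstrum(s) for s in drift]])
    y = np.array([0] * 100 + [1] * 100)
    clf = SVC(kernel="rbf").fit(X[::2], y[::2])        # train on even rows
    print("held-out accuracy:", clf.score(X[1::2], y[1::2]))
```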
