Search | arXiv e-print repository

<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1&ndash;50 of 235 results for author: <span class="mathjax">Huang, H</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> <div class="content"> <form method="GET" action="/search/eess" aria-role="search"> Searching in archive <strong>eess</strong>. <a href="/search/?searchtype=author&amp;query=Huang%2C+H">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Huang, H"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Huang%2C+H&amp;terms-0-field=author&amp;size=50&amp;order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Huang, H"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Huang%2C+H&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Huang%2C+H&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Huang%2C+H&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Huang%2C+H&amp;start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Huang%2C+H&amp;start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Huang%2C+H&amp;start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12985">arXiv:2411.12985</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.12985">pdf</a>, <a href="https://arxiv.org/format/2411.12985">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Disco Intelligent Omni-Surfaces: 360-degree Fully-Passive Jamming Attacks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Huang%2C+H">Huan Huang</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+H">Hongliang Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Yuan%2C+J">Jide Yuan</a>, <a href="/search/eess?searchtype=author&amp;query=Sun%2C+L">Luyao Sun</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+Y">Yitian Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Mei%2C+W">Weidong Mei</a>, <a href="/search/eess?searchtype=author&amp;query=Di%2C+B">Boya Di</a>, <a href="/search/eess?searchtype=author&amp;query=Cai%2C+Y">Yi Cai</a>, <a href="/search/eess?searchtype=author&amp;query=Han%2C+Z">Zhu Han</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12985v1-abstract-short" style="display: inline;"> Intelligent omni-surfaces (IOSs) with 360-degree electromagnetic radiation significantly improves the performance of wireless systems, while an adversarial IOS also poses a significant potential risk for physical layer security. 
In this paper, we propose a "DISCO" IOS (DIOS) based fully-passive jammer (FPJ) that can launch omnidirectional fully-passive jamming attacks. In the proposed DIOS-based FPJ, the interrelated refractive and reflective (R&R) coefficients of the adversarial IOS are randomly generated, acting like a "DISCO" that scatters the wireless energy radiated by the base station. By introducing active channel aging (ACA) during the channel coherence time, the DIOS-based FPJ can perform omnidirectional fully-passive jamming with neither jamming power nor channel knowledge of the legitimate users (LUs). To characterize the impact of the DIOS-based FPJ, we derive the statistical characteristics of DIOS-jammed channels for two widely used IOS models, i.e., the constant-amplitude model and the variable-amplitude model. An asymptotic analysis of the ergodic achievable sum rates under DIOS-based omnidirectional fully-passive jamming is then given for both IOS models based on the derived statistics. The analysis shows that the omnidirectional jamming impact of a DIOS-based FPJ implemented with a constant-amplitude IOS depends on neither the quantization number nor the stochastic distribution of the DIOS coefficients, whereas this conclusion does not hold when a variable-amplitude IOS is used. Numerical results based on one-bit quantization of the IOS phase shifts verify the derived theoretical analysis. The proposed DIOS-based FPJ not only launches omnidirectional fully-passive jamming but also increases the jamming impact by about 55% at 10 dBm transmit power per LU.
Submitted 19 November, 2024; originally announced November 2024.
Comments: This paper has been submitted to IEEE TWC for possible publication.

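The core mechanism above (random coefficient redraws defeating a precoder matched to a stale channel) is simple enough to illustrate with a toy Monte Carlo run. The sketch below is not the authors' code: the zero-forcing precoder, the Rayleigh channel model, and all dimensions are illustrative assumptions.

    import numpy as np

    rng = np.random.default_rng(0)
    M, K, N, P = 8, 4, 64, 10.0   # BS antennas, users, IOS elements, total power
    trials = 1000

    def zf(H):
        # Zero-forcing precoder with unit-norm columns.
        W = np.linalg.pinv(H)
        return W / np.linalg.norm(W, axis=0, keepdims=True)

    sum_rate = {"fresh": 0.0, "aged": 0.0}
    for _ in range(trials):
        Hd = (rng.standard_normal((K, M)) + 1j * rng.standard_normal((K, M))) / np.sqrt(2)
        G = (rng.standard_normal((N, M)) + 1j * rng.standard_normal((N, M))) / np.sqrt(2)
        F = (rng.standard_normal((K, N)) + 1j * rng.standard_normal((K, N))) / np.sqrt(2)
        # One-bit surface coefficients at pilot time, then redrawn before data:
        coeff = {k: np.exp(1j * np.pi * rng.integers(0, 2, N)) for k in sum_rate}
        H = {k: Hd + F @ np.diag(v) @ G / np.sqrt(N) for k, v in coeff.items()}
        W = zf(H["fresh"])            # precoder matched to the pilot-time channel
        for k in sum_rate:
            S = H[k] @ W              # effective channel seen during data slots
            sig = np.abs(np.diag(S)) ** 2
            intf = np.sum(np.abs(S) ** 2, axis=1) - sig
            sinr = (P / K) * sig / ((P / K) * intf + 1.0)
            sum_rate[k] += np.sum(np.log2(1.0 + sinr)) / trials

    print(sum_rate)  # the "aged" sum rate collapses relative to the "fresh" one

The toy makes the fully-passive nature of the attack visible: the surface spends no transmit power, it only scrambles the channel between estimation and use.
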
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This paper has been submitted to IEEE TWC for possible publication</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11886">arXiv:2411.11886</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.11886">pdf</a>, <a href="https://arxiv.org/format/2411.11886">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> How Much Data is Enough? Optimization of Data Collection for Artifact Detection in EEG Recordings </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Wang-N%C3%B6th%2C+L">Lu Wang-N枚th</a>, <a href="/search/eess?searchtype=author&amp;query=Heiler%2C+P">Philipp Heiler</a>, <a href="/search/eess?searchtype=author&amp;query=Huang%2C+H">Hai Huang</a>, <a href="/search/eess?searchtype=author&amp;query=Lichtenstern%2C+D">Daniel Lichtenstern</a>, <a href="/search/eess?searchtype=author&amp;query=Reichenbach%2C+A">Alexandra Reichenbach</a>, <a href="/search/eess?searchtype=author&amp;query=Flacke%2C+L">Luis Flacke</a>, <a href="/search/eess?searchtype=author&amp;query=Maisch%2C+L">Linus Maisch</a>, <a href="/search/eess?searchtype=author&amp;query=Mayer%2C+H">Helmut Mayer</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11886v2-abstract-short" style="display: inline;"> Objective. Electroencephalography (EEG) is a widely used neuroimaging technique known for its cost-effectiveness and user-friendliness. However, various artifacts, particularly biological artifacts like Electromyography (EMG) signals, lead to a poor signal-to-noise ratio, limiting the precision of analyses and applications. The currently reported EEG data cleaning performance largely depends on th&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11886v2-abstract-full').style.display = 'inline'; document.getElementById('2411.11886v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11886v2-abstract-full" style="display: none;"> Objective. Electroencephalography (EEG) is a widely used neuroimaging technique known for its cost-effectiveness and user-friendliness. However, various artifacts, particularly biological artifacts like Electromyography (EMG) signals, lead to a poor signal-to-noise ratio, limiting the precision of analyses and applications. The currently reported EEG data cleaning performance largely depends on the data used for validation, and in the case of machine learning approaches, also on the data used for training. The data are typically gathered either by recruiting subjects to perform specific artifact tasks or by integrating existing datasets. Prevailing approaches, however, tend to rely on intuitive, concept-oriented data collection with minimal justification for the selection of artifacts and their quantities. 
Given the substantial costs associated with biological data collection and the pressing need for effective data utilization, we propose an optimization procedure for data-oriented data collection design using deep learning-based artifact detection. Approach. We apply binary classification between artifact epochs (time intervals containing artifacts) and non-artifact epochs (time intervals containing no artifacts) using three different neural architectures. Our aim is to minimize data collection efforts while preserving the cleaning efficiency. Main results. We were able to reduce the number of artifact tasks from twelve to three and to decrease the repetitions of isometric contraction tasks from ten to three, or sometimes even one. Significance. Our work addresses the need for effective data utilization in biological data collection, offering a systematic and dynamic quantitative approach. By providing clear justifications for the choices of artifacts and their quantity, we aim to guide future studies toward more effective and economical data collection in EEG and EMG research.
Submitted 20 November, 2024; v1 submitted 5 November, 2024; originally announced November 2024.
Comments: Several changes of wording. Caption of figure 10 corrected.

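The classification task at the heart of this procedure (artifact epoch vs. non-artifact epoch) is easy to sketch. The small PyTorch model below is a generic stand-in, not one of the three architectures benchmarked in the paper; the channel count and epoch length are invented.

    import torch
    import torch.nn as nn

    class EpochClassifier(nn.Module):
        """Binary classifier over fixed-length multi-channel EEG epochs."""
        def __init__(self, n_channels=32, n_samples=512):
            super().__init__()
            self.net = nn.Sequential(
                nn.Conv1d(n_channels, 16, kernel_size=7, padding=3),
                nn.ReLU(),
                nn.MaxPool1d(4),
                nn.Conv1d(16, 32, kernel_size=7, padding=3),
                nn.ReLU(),
                nn.AdaptiveAvgPool1d(1),
                nn.Flatten(),
                nn.Linear(32, 1),   # one logit: artifact vs. non-artifact
            )

        def forward(self, x):        # x: (batch, channels, samples)
            return self.net(x).squeeze(-1)

    model = EpochClassifier()
    x = torch.randn(8, 32, 512)      # a batch of eight epochs
    y = torch.randint(0, 2, (8,)).float()
    loss = nn.BCEWithLogitsLoss()(model(x), y)
    loss.backward()

The optimization loop the paper describes then amounts to retraining such a classifier on progressively smaller subsets of artifact tasks and repetitions, and shrinking the collection protocol as long as validation performance holds.
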
3. arXiv:2411.11762 [pdf]; cs.RO (Robotics); eess.SY (Systems and Control)
Title: High-Speed Cornering Control and Real-Vehicle Deployment for Autonomous Electric Vehicles
Authors: Shiyue Zhao, Junzhi Zhang, Neda Masoud, Yuhong Jiang, Heye Huang, Tao Liu
Abstract: Executing drift maneuvers during high-speed cornering presents significant challenges for autonomous vehicles, yet offers the potential to minimize turning time and enhance driving dynamics. While reinforcement learning (RL) has shown promising results in simulated environments, discrepancies between simulations and real-world conditions have limited its practical deployment. This study introduces an innovative control framework that integrates trajectory optimization with drift maneuvers, aiming to improve the algorithm's adaptability for real-vehicle implementation. We leverage Bezier-based pre-trajectory optimization to enhance rewards and optimize the controller through Twin Delayed Deep Deterministic Policy Gradient (TD3) in a simulated environment. For real-world deployment, we implement a hybrid RL-MPC fusion mechanism, where TD3-derived maneuvers serve as primary inputs for a model predictive controller (MPC). This integration enables precise real-time tracking of the optimal trajectory, with the MPC providing corrective inputs to bridge the gap between simulation and reality. The efficacy of this method is validated through real-vehicle tests on consumer-grade electric vehicles, focusing on drift U-turns and drift right-angle turns. The control outcomes of these real-vehicle tests are thoroughly documented in the paper, supported by supplementary video evidence (https://youtu.be/5wp67FcpfL8). Notably, this study is the first to deploy and apply an RL-based transient drift cornering algorithm on consumer-grade electric vehicles.
Submitted 18 November, 2024; originally announced November 2024.
Comments: In the process of being submitted to IEEE Transactions on Industrial Electronics.

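The deployment pattern described here (a learned policy supplying the nominal maneuver, a model-based tracker supplying corrections) can be sketched compactly. Below, a finite-horizon LQR on a linearized lateral-error model stands in for the paper's MPC, and a constant stands in for the TD3 output; the model, horizon, and all gains are assumptions made for illustration.

    import numpy as np

    dt, horizon = 0.02, 25
    A = np.array([[1.0, dt], [0.0, 1.0]])   # state: [lateral error, error rate]
    B = np.array([[0.0], [dt * 8.0]])       # assumed steering-to-lateral gain
    Q, R = np.diag([4.0, 1.0]), np.array([[0.1]])

    def tracking_gain(A, B, Q, R, horizon):
        # Backward Riccati recursion for a finite-horizon LQR tracker.
        P = Q.copy()
        for _ in range(horizon):
            K = np.linalg.solve(R + B.T @ P @ B, B.T @ P @ A)
            P = Q + A.T @ P @ (A - B @ K)
        return K

    K = tracking_gain(A, B, Q, R, horizon)

    def hybrid_step(error_state, rl_steer):
        # RL feedforward plus feedback correction around the reference trajectory.
        return rl_steer - (K @ error_state).item()

    # e.g. 0.3 rad nominal drift steering from the policy, 0.5 m lateral error:
    print(hybrid_step(np.array([0.5, 0.0]), 0.3))

The design choice the abstract highlights survives even in this toy: the learned policy never acts alone on the real vehicle; every command passes through a model-based corrector that absorbs the sim-to-real mismatch.
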
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">In the process of being submitted to the Journal of IEEE Transactions on Industrial Electronics</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11221">arXiv:2411.11221</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.11221">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Data Driven Automatic Electrical Machine Preliminary Design with Artificial Intelligence Expert Guidance </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Wang%2C+Y">Yiwei Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Yang%2C+T">Tao Yang</a>, <a href="/search/eess?searchtype=author&amp;query=Huang%2C+H">Hailin Huang</a>, <a href="/search/eess?searchtype=author&amp;query=Zou%2C+T">Tianjie Zou</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+J">Jincai Li</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+N">Nuo Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+Z">Zhuoran Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11221v1-abstract-short" style="display: inline;"> This paper presents a data-driven electrical machine design (EMD) framework using wound-rotor synchronous generator (WRSG) as a design example. Unlike traditional preliminary EMD processes that heavily rely on expertise, this framework leverages an artificial-intelligence based expert database, to provide preliminary designs directly from user specifications. Initial data is generated using 2D fin&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11221v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11221v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11221v1-abstract-full" style="display: none;"> This paper presents a data-driven electrical machine design (EMD) framework using wound-rotor synchronous generator (WRSG) as a design example. Unlike traditional preliminary EMD processes that heavily rely on expertise, this framework leverages an artificial-intelligence based expert database, to provide preliminary designs directly from user specifications. Initial data is generated using 2D finite element (FE) machine models by sweeping fundamental design variables including machine length and diameter, enabling scalable machine geometry with machine performance for each design is recorded. This data trains a Metamodel of Optimal Prognosis (MOP)-based surrogate model, which maps design variables to key performance indicators (KPIs). Once trained, guided by metaheuristic algorithms, the surrogate model can generate thousands of geometric scalable designs, covering a wide power range, forming an AI expert database to guide future preliminary design. The framework is validated with a 30kVA WRSG design case. 
A prebuilt WRSG database covering powers from 10 to 60 kVA is validated by FE simulation. Design No. 1138 is selected from the database and compared with a conventional design. The results show that No. 1138 achieves a higher power density of 2.21 kVA/kg in just 5 seconds, compared to 2.02 kVA/kg obtained using the traditional method, which takes several days. The developed AI expert database also serves as a high-quality data source for further developing AI models for automatic electrical machine design.
Submitted 17 November, 2024; originally announced November 2024.

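The pipeline in this abstract (sweep design variables with the expensive solver, fit a surrogate from variables to KPIs, then let a search algorithm query the cheap surrogate) is shown below in miniature. A quadratic response surface replaces the MOP surrogate, random search replaces the metaheuristic, and the "FE solver" is a fabricated placeholder, so none of the numbers mean anything physically.

    import numpy as np

    rng = np.random.default_rng(1)

    def fe_solver(length, diameter):
        # Placeholder for the expensive finite-element evaluation (kVA/kg).
        return 2.0 + 0.8 * length * diameter \
            - 0.5 * (length - 0.6) ** 2 - 0.4 * (diameter - 0.4) ** 2

    # 1) Data generation: sweep the fundamental design variables.
    L, D = np.meshgrid(np.linspace(0.2, 1.0, 9), np.linspace(0.2, 0.8, 7))
    X = np.column_stack([L.ravel(), D.ravel()])
    y = np.array([fe_solver(l, d) for l, d in X])

    # 2) Surrogate: quadratic response surface fitted by least squares.
    def feats(X):
        l, d = X[:, 0], X[:, 1]
        return np.column_stack([np.ones_like(l), l, d, l * d, l ** 2, d ** 2])
    coef, *_ = np.linalg.lstsq(feats(X), y, rcond=None)

    # 3) Metaheuristic stand-in: random search over the cheap surrogate only.
    cand = rng.uniform([0.2, 0.2], [1.0, 0.8], size=(5000, 2))
    pred = feats(cand) @ coef
    best = cand[np.argmax(pred)]
    print(f"surrogate optimum: length={best[0]:.2f}, diameter={best[1]:.2f}, "
          f"predicted KPI={pred.max():.2f} kVA/kg")

The speedup claimed in the abstract comes from exactly this separation: the solver is only called during the sweep, and every later design query hits the surrogate in microseconds.
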
5. arXiv:2411.10684 [pdf, other]; eess.IV (Image and Video Processing); cs.CV (Computer Vision and Pattern Recognition); cs.LG (Machine Learning)
Title: HIST-AID: Leveraging Historical Patient Reports for Enhanced Multi-Modal Automatic Diagnosis
Authors: Haoxu Huang, Cem M. Deniz, Kyunghyun Cho, Sumit Chopra, Divyam Madaan
Abstract: Chest X-ray imaging is a widely accessible and non-invasive diagnostic tool for detecting thoracic abnormalities. While numerous AI models assist radiologists in interpreting these images, most overlook patients' historical data. To bridge this gap, we introduce the Temporal MIMIC dataset, which integrates five years of patient history, including radiographic scans and reports from MIMIC-CXR and MIMIC-IV, encompassing 12,221 patients and thirteen pathologies. Building on this, we present HIST-AID, a framework that enhances automatic diagnostic accuracy using historical reports. HIST-AID emulates the radiologist's comprehensive approach, leveraging historical data to improve diagnostic accuracy. Our experiments demonstrate significant improvements, with AUROC increasing by 6.56% and AUPRC by 9.51% compared to models that rely solely on radiographic scans. These gains were consistently observed across diverse demographic groups, including variations in gender, age, and racial categories. We show that while recent data boost performance, older data may reduce accuracy due to changes in patient conditions. Our work demonstrates the potential of incorporating historical data for more reliable automatic diagnosis, providing critical support for clinical decision-making.
Submitted 15 November, 2024; originally announced November 2024.
Comments: In Proceedings of Machine Learning for Health.

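As the abstract reads, the architectural idea is to condition the image-based prediction on a time series of prior report embeddings. A schematic fusion module under that reading follows; the dimensions, the cross-attention fusion, and the 13-way output head are assumptions, not the published design.

    import torch
    import torch.nn as nn

    class HistoryFusion(nn.Module):
        def __init__(self, d=256, n_path=13):
            super().__init__()
            self.attn = nn.MultiheadAttention(d, num_heads=4, batch_first=True)
            self.head = nn.Linear(2 * d, n_path)  # thirteen pathologies

        def forward(self, img_feat, report_seq):
            # img_feat: (B, d) from an image encoder over the current scan;
            # report_seq: (B, T, d) from a text encoder over prior reports.
            q = img_feat.unsqueeze(1)
            ctx, _ = self.attn(q, report_seq, report_seq)  # attend over history
            return self.head(torch.cat([img_feat, ctx.squeeze(1)], dim=-1))

    model = HistoryFusion()
    logits = model(torch.randn(2, 256), torch.randn(2, 7, 256))  # 7 prior reports
    print(logits.shape)  # torch.Size([2, 13])

An attention pool of this kind would also let the model down-weight old reports, which is consistent with the abstract's observation that stale history can hurt accuracy.
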
6. arXiv:2410.22799 [pdf, other]; eess.SP (Signal Processing)
Title: RIS-Aided Dual-Polarized MIMO: How Large a Surface is Needed to Beat Single Polarization?
Authors: Zizhou Zheng, Huan Huang, Hongliang Zhang, A. Lee Swindlehurst
Abstract: Dual-polarized (DP) multiple-input multiple-output (MIMO) systems have been widely adopted in commercial mobile wireless communications. Such systems achieve multiplexing and diversity gains by exploiting the polarization dimension. However, existing studies have shown that the capacity of DP MIMO may not surpass that of single-polarized (SP) MIMO systems, due to the cross-polarization coupling induced by the propagation environment. In this letter, we employ reconfigurable intelligent surfaces (RISs) to address this issue and investigate how large the surface should be to ensure better performance for DP MIMO. Specifically, we first derive the capacities of DP and SP MIMO systems with an RIS, and then study the influence of the RIS size on the system capacity. Our analyses reveal how to deploy the RIS in a DP MIMO scenario.
Submitted 30 October, 2024; originally announced October 2024.

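The quantity compared throughout the letter is MIMO capacity under a cross-polarization coupling model. A minimal numerical setup for such a comparison is sketched below; the coupling mask, the idealized SP baseline, and the SNR are all illustrative, and the letter's actual analysis works with RIS-modified channel statistics rather than raw Rayleigh samples.

    import numpy as np

    rng = np.random.default_rng(3)
    snr, trials = 10.0, 4000

    def capacity(H, snr):
        # C = log2 det(I + (snr / n_t) H H^H), equal power per transmit antenna.
        n_r, n_t = H.shape
        I = np.eye(n_r)
        return float(np.log2(np.linalg.det(I + (snr / n_t) * H @ H.conj().T)).real)

    def rayleigh(shape):
        return (rng.standard_normal(shape) + 1j * rng.standard_normal(shape)) / np.sqrt(2)

    chi = 0.1  # assumed cross-polar coupling power (paths across polarizations are weak)
    mask = np.array([[1.0, np.sqrt(chi)], [np.sqrt(chi), 1.0]])
    c_dp = np.mean([capacity(mask * rayleigh((2, 2)), snr) for _ in range(trials)])
    c_sp = np.mean([capacity(rayleigh((2, 2)), snr) for _ in range(trials)])
    print(f"ergodic capacity: DP {c_dp:.2f} vs SP {c_sp:.2f} bit/s/Hz")

In this crude model the attenuated cross-polar paths cost the DP link capacity relative to an ideal SP array, which is the deficit the letter proposes to recover, and size, with an RIS.
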
7. arXiv:2410.19415 [pdf]; eess.IV (Image and Video Processing); cs.CV (Computer Vision and Pattern Recognition); eess.SP (Signal Processing)
Title: Integration of Communication and Computational Imaging
Authors: Zhenming Yu, Liming Cheng, Hongyu Huang, Wei Zhang, Liang Lin, Kun Xu
Abstract: Communication expands human visual perception beyond the limitations of time and distance, while computational imaging overcomes the constraints of depth and breadth. Although impressive achievements have been made with both types of technologies, the siloed information flow between the two domains is a bottleneck hindering further progress. Herein, we propose a novel framework that integrates communication and computational imaging (ICCI) to break through the inherent isolation between communication and computational imaging for remote perception. By jointly considering the sensing and transmission of remote visual information, the ICCI framework performs a full-link information transfer optimization, aiming to minimize information loss from the generation of the information source to the execution of the final vision tasks. We conduct numerical analyses and experiments to demonstrate the ICCI framework by integrating communication systems with snapshot compressive imaging systems. Compared with straightforward combination schemes, which execute sensing and transmission sequentially, the ICCI scheme shows greater robustness against channel noise and impairments while achieving higher data compression. Moreover, 27-band hyperspectral video perception at 30 fps over an 80 km link is achieved experimentally. This new ICCI remote perception paradigm offers a high-efficiency solution for various real-time computer vision tasks.
Submitted 29 October, 2024; v1 submitted 25 October, 2024; originally announced October 2024.

8. arXiv:2410.17485 [pdf, other]; cs.CL (Computation and Language); eess.AS (Audio and Speech Processing)
Title: VoiceTextBlender: Augmenting Large Language Models with Speech Capabilities via Single-Stage Joint Speech-Text Supervised Fine-Tuning
Authors: Yifan Peng, Krishna C. Puvvada, Zhehuai Chen, Piotr Zelasko, He Huang, Kunal Dhawan, Ke Hu, Shinji Watanabe, Jagadeesh Balam, Boris Ginsburg
Abstract: Recent studies have augmented large language models (LLMs) with speech capabilities, leading to the development of speech language models (SpeechLMs). Earlier SpeechLMs focused on single-turn speech-based question answering (QA), where the user input comprised a speech context and a text question. More recent studies have extended this to multi-turn conversations, though they often require complex, multi-stage supervised fine-tuning (SFT) with diverse data. Another critical challenge with SpeechLMs is catastrophic forgetting, where models optimized for speech tasks suffer significant degradation in text-only performance. To mitigate these issues, we propose a novel single-stage joint speech-text SFT approach on the low-rank adaptation (LoRA) of the LLM backbone. Our joint SFT combines text-only SFT data with three types of speech-related data: speech recognition and translation, speech-based QA, and mixed-modal SFT.
Compared to previous SpeechLMs with 7B or 13B parameters, our 3B model demonstrates superior performance across various speech benchmarks while preserving the original capabilities on text-only tasks. Furthermore, our model shows emergent abilities of effectively handling previously unseen prompts and tasks, including multi-turn, mixed-modal inputs.
Submitted 22 October, 2024; originally announced October 2024.

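The one concrete mechanism named here that is easy to show in code is the LoRA part: during the joint speech-text SFT, only low-rank adapters on the backbone's linear layers are trained while the backbone stays frozen. A minimal LoRA wrapper follows; the rank and scaling are illustrative choices, not the paper's configuration.

    import torch
    import torch.nn as nn

    class LoRALinear(nn.Module):
        def __init__(self, base: nn.Linear, rank=8, alpha=16):
            super().__init__()
            self.base = base
            for p in self.base.parameters():
                p.requires_grad = False   # backbone weights stay frozen
            self.A = nn.Parameter(torch.randn(rank, base.in_features) * 0.01)
            self.B = nn.Parameter(torch.zeros(base.out_features, rank))
            self.scale = alpha / rank     # B starts at zero, so training begins
                                          # from the unmodified backbone
        def forward(self, x):
            return self.base(x) + self.scale * (x @ self.A.T @ self.B.T)

    layer = LoRALinear(nn.Linear(512, 512))
    print(layer(torch.randn(2, 512)).shape)  # torch.Size([2, 512])

Training only these low-rank deltas is what lets the approach add speech skills while limiting the catastrophic forgetting of text-only ability that the abstract calls out.
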
9. arXiv:2410.15895 [pdf, other]; quant-ph (Quantum Physics); eess.SY (Systems and Control)
Title: Cryogenic Control and Readout Integrated Circuits for Solid-State Quantum Computing
Authors: Lingxiao Lei, Heng Huang, Pingxing Chen, Mingtang Deng
Abstract: In the pursuit of quantum computing, solid-state quantum systems, particularly superconducting ones, have made remarkable advances over the past two decades. However, achieving fault-tolerant quantum computing for next-generation applications necessitates the integration of several million qubits, which presents significant challenges in terms of interconnection complexity and latency that are currently unsolvable with state-of-the-art room-temperature control and readout electronics. Recently, cryogenic integrated circuits (ICs), including CMOS radio-frequency ICs and rapid-single-flux-quantum-logic ICs, have emerged as potential alternatives to room-temperature electronics. Unlike their room-temperature counterparts, these ICs are deployed within cryostats to enhance scalability by reducing the number and length of transmission lines. Additionally, operating at cryogenic temperatures can suppress electronic noise and improve qubit control fidelity. However, for CMOS ICs specifically, circuit design uncertainties arise from the lack of reliable models for cryogenic field-effect transistors, as well as from severe flicker noise and power dissipation issues at cryogenic temperatures. This paper provides a comprehensive review of recent research on both types of cryogenic control and readout ICs, but primarily focuses on the more mature CMOS technology. The discussion encompasses the principles underlying control and readout techniques employed in cryogenic CMOS ICs along with their architectural designs; characterization and modeling approaches for field-effect transistors under cryogenic conditions; and fundamental concepts pertaining to rapid single-flux-quantum circuits.
Submitted 30 October, 2024; v1 submitted 21 October, 2024; originally announced October 2024.

10. arXiv:2410.05793 [pdf, other]; eess.SY (Systems and Control)
Title: Distributed Coordination for Multi-Vehicle Systems in the Presence of Misbehaving Vehicles
Authors: Dongkun Han, Yijun Huang, Hejun Huang, Tianrui Fang
Abstract: The coordination problem of multi-vehicle systems is of great interest in the areas of autonomous driving and multi-vehicle control. This work focuses on the multi-task coordination problem for a group of vehicles with a bicycle model and specific control objectives, including collision avoidance, connectivity maintenance, and convergence to desired destinations. The basic idea is to develop a proper Lyapunov-like barrier function encoding all tasks, from which a distributed controller can be built in the presence of misbehaving vehicles. Control protocols are provided for both the leader vehicle and the follower vehicles. Simulation results demonstrate the effectiveness of the proposed method.
Submitted 8 October, 2024; originally announced October 2024.
Comments: 13 pages, 5 figures; accepted by the 15th Asia Conference on Mechanical and Aerospace Engineering (ACMAE 2024).
MSC Class: 93A16

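The construction described (one Lyapunov-like barrier whose value blows up near the collision and connectivity-loss boundaries and decreases toward the destinations, with the controller descending it) looks roughly like the following toy. Two single-integrator vehicles replace the paper's bicycle model, and all radii, gains, and goal positions are invented for illustration.

    import numpy as np

    d_col, d_conn = 1.0, 8.0                   # collision / connectivity radii
    goals = np.array([[5.0, 0.0], [0.0, 5.0]])

    def barrier(p):
        # Goal attraction plus reciprocal barriers on the pairwise distance.
        d = np.linalg.norm(p[0] - p[1])
        return np.sum((p - goals) ** 2) + 1.0 / (d - d_col) + 1.0 / (d_conn - d)

    def grad(p, eps=1e-5):
        # Central-difference gradient of the barrier function.
        g = np.zeros_like(p)
        for i in np.ndindex(p.shape):
            dp = np.zeros_like(p)
            dp[i] = eps
            g[i] = (barrier(p + dp) - barrier(p - dp)) / (2 * eps)
        return g

    p = np.array([[0.0, 0.0], [2.0, 1.0]])     # initial positions
    for _ in range(400):                       # gradient-descent control law
        p -= 0.01 * grad(p)
    print(np.round(p, 2))  # both vehicles settle near their goals while their
                           # spacing stays inside (d_col, d_conn) throughout

Because the barrier terms diverge at the constraint boundaries, any descent direction automatically steers the vehicles away from collisions and from losing connectivity; the paper's contribution is doing this in a distributed way that also tolerates misbehaving vehicles.
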
11. arXiv:2409.15905 [pdf, other]; cs.SD (Sound); cs.AI (Artificial Intelligence); eess.AS (Audio and Speech Processing)
Title: Boosting Code-Switching ASR with Mixture of Experts Enhanced Speech-Conditioned LLM
Authors: Fengrun Zhang, Wang Geng, Hukai Huang, Yahui Shan, Cheng Yi, He Qu
Abstract: In this paper, we introduce a speech-conditioned Large Language Model (LLM) integrated with a Mixture of Experts (MoE) based connector to address the challenge of Code-Switching (CS) in Automatic Speech Recognition (ASR). Specifically, we propose an Insertion and Deletion of Interruption Token (IDIT) mechanism to better transfer the text generation ability of the LLM to the speech recognition task. We also present a connector with an MoE architecture that manages multiple languages efficiently. To further enhance the collaboration of the multiple experts and leverage the understanding capabilities of the LLM, we propose a two-stage progressive training strategy: 1) the connector is unfrozen and trained with language-specialized experts to map speech representations to the text space; 2) the connector and the LLM LoRA adaptor are trained with the proposed IDIT mechanism, and all experts are activated to learn general representations. Experimental results demonstrate that our method significantly outperforms state-of-the-art models, including end-to-end and large-scale audio-language models.
Submitted 30 October, 2024; v1 submitted 24 September, 2024; originally announced September 2024.
Comments: Submitted to ICASSP 2025.

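A rough sketch of what an MoE connector between a speech encoder and an LLM looks like: a router softly weights language-specialized expert projections of each frame before handing the result to the LLM's embedding space. The dimensions and the dense (rather than top-k) routing are guesses at the general pattern, not the paper's exact design.

    import torch
    import torch.nn as nn

    class MoEConnector(nn.Module):
        def __init__(self, d_speech=512, d_llm=2048, n_experts=4):
            super().__init__()
            self.router = nn.Linear(d_speech, n_experts)
            self.experts = nn.ModuleList(
                nn.Sequential(nn.Linear(d_speech, d_llm), nn.GELU(),
                              nn.Linear(d_llm, d_llm))
                for _ in range(n_experts))

        def forward(self, h):                  # h: (B, T, d_speech)
            gates = torch.softmax(self.router(h), dim=-1)            # (B, T, E)
            outs = torch.stack([e(h) for e in self.experts], dim=-2) # (B, T, E, d_llm)
            return (gates.unsqueeze(-1) * outs).sum(dim=-2)          # (B, T, d_llm)

    conn = MoEConnector()
    print(conn(torch.randn(2, 50, 512)).shape)  # torch.Size([2, 50, 2048])

Per-language experts with a learned router match the two-stage recipe above: stage one trains each expert on its own language, and stage two activates all experts so that frames in code-switched speech can blend them.
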

arXiv:2409.14499 (https://arxiv.org/abs/2409.14499) [pdf, other]
Subjects: Systems and Control (eess.SY); Optimization and Control (math.OC)
Title: A Review of Scalable and Privacy-Preserving Multi-Agent Frameworks for Distributed Energy Resources
Authors: Xiang Huo, Hao Huang, Katherine R. Davis, H. Vincent Poor, Mingxi Liu
Abstract: Distributed energy resources (DERs) are gaining prominence due to their advantages in improving energy efficiency, reducing carbon emissions, and enhancing grid resilience. Despite the increasing deployment, the potential of DERs has yet to be fully explored and exploited. A fundamental question restrains the management of numerous DERs in large-scale power systems: "How should DER data be securely processed and DER operations be efficiently optimized?" To address this question, this paper considers two critical issues, namely privacy for processing DER data and scalability in optimizing DER operations, then surveys existing and emerging solutions from a multi-agent framework perspective. In the context of scalability, this paper reviews state-of-the-art research that relies on parallel control, optimization, and learning within distributed and/or decentralized information exchange structures, while in the context of privacy, it identifies privacy preservation measures that can be synthesized into the aforementioned scalable structures. Despite research advances in these areas, challenges remain because these highly interdisciplinary studies blend a wide variety of scalable computing architectures and privacy preservation techniques from different fields, making them difficult to adapt in practice. To mitigate this issue, this paper provides a holistic review of trending strategies that orchestrate privacy and scalability for large-scale power system operations from a multi-agent perspective, particularly for DER control problems. Furthermore, this review extrapolates new approaches for future scalable, privacy-aware, and cybersecure pathways to unlock the full potential of DERs through controlling, optimizing, and learning generic multi-agent-based cyber-physical systems.
Submitted 11 November, 2024; v1 submitted 22 September, 2024; originally announced September 2024.

arXiv:2409.12352 (https://arxiv.org/abs/2409.12352) [pdf, other]
Subjects: Audio and Speech Processing (eess.AS); Sound (cs.SD)
Title: META-CAT: Speaker-Informed Speech Embeddings via Meta Information Concatenation for Multi-talker ASR
Authors: Jinhan Wang, Weiqing Wang, Kunal Dhawan, Taejin Park, Myungjong Kim, Ivan Medennikov, He Huang, Nithin Koluguri, Jagadeesh Balam, Boris Ginsburg
Abstract: We propose a novel end-to-end multi-talker automatic speech recognition (ASR) framework that enables both multi-speaker (MS) ASR and target-speaker (TS) ASR. Our proposed model is trained in a fully end-to-end manner, incorporating speaker supervision from a pre-trained speaker diarization module. We introduce an intuitive yet effective method for masking ASR encoder activations using output from the speaker supervision module, a technique we term Meta-Cat (meta-information concatenation), which can be applied to both MS-ASR and TS-ASR. Our results demonstrate that the proposed architecture achieves competitive performance on both MS-ASR and TS-ASR tasks, without the need for traditional methods such as neural mask estimation or masking at the audio or feature level. Furthermore, we demonstrate a glimpse of a unified dual-task model that can efficiently handle both MS-ASR and TS-ASR tasks. Thus, this work illustrates that a robust end-to-end multi-talker ASR framework can be implemented with a streamlined architecture, obviating the need for the complex speaker filtering mechanisms employed in previous studies.
Submitted 18 September, 2024; originally announced September 2024.
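
A hedged reading of the meta-information concatenation step: diarization posteriors gate the shared encoder activations per speaker, and the raw speaker-activity channel is concatenated onto the result. Shapes and names below are assumptions, not the released implementation.

import torch

def meta_cat(encoder_states, speaker_probs):
    # encoder_states: (B, T, D) ASR encoder activations
    # speaker_probs:  (B, T, S) per-frame speaker posteriors from diarization
    masked = encoder_states.unsqueeze(2) * speaker_probs.unsqueeze(-1)   # (B, T, S, D)
    # Concatenate the activity channel so the decoder also sees raw speaker evidence.
    return torch.cat([masked, speaker_probs.unsqueeze(-1)], dim=-1)      # (B, T, S, D + 1)

states = torch.randn(2, 100, 256)
probs = torch.rand(2, 100, 4)
out = meta_cat(states, probs)  # one masked stream per speaker; TS-ASR would pick one slice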

arXiv:2409.11928 (https://arxiv.org/abs/2409.11928) [pdf]
Subjects: Signal Processing (eess.SP)
Title: Atmospheric Turbulence-Immune Free Space Optical Communication System based on Discrete-Time Analog Transmission
Authors: Hongyu Huang, Zhenming Yu, Yi Lei, Wei Zhang, Yongli Zhao, Shanguo Huang, Kun Xu
Abstract: To effectively mitigate the influence of atmospheric turbulence, a novel discrete-time analog transmission free-space optical (DTAT-FSO) communication scheme is proposed. It directly maps information sources to discrete-time analog symbols via joint source-channel coding and modulation. Unlike traditional digital free-space optical (TD-FSO) schemes, the proposed DTAT-FSO approach can automatically adapt to variations in the channel state, with no need to adjust the modulation and coding scheme. The performance of the DTAT-FSO system was evaluated in both intensity modulation/direct detection (IM/DD) and coherent FSO systems for high-resolution image transmission. The results show that DTAT-FSO reliably transmits images at low received optical powers (ROPs) and automatically enhances quality at high ROPs, while TD-FSO suffers cliff and leveling effects as the channel state varies. Relative to the TD-FSO scheme, the DTAT-FSO scheme improved receiver sensitivity by 2.5 dB in the IM/DD FSO system and 0.8 dB in the coherent FSO system, and it achieved superior image fidelity at the same ROP. The automatic adaptation and improved performance of DTAT-FSO suggest its potential for terrestrial, airborne, and satellite optical networks, addressing challenges posed by atmospheric turbulence.
Submitted 18 September, 2024; originally announced September 2024.
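
The graceful-degradation claim is easy to demonstrate with a toy model: if source samples are sent directly as analog symbols over an AWGN channel, reconstruction quality tracks the SNR continuously instead of collapsing below a coding threshold. This is a deliberately minimal sketch, not the paper's joint source-channel coder.

import numpy as np

def dtat_toy(x, snr_db):
    """Send samples as discrete-time analog symbols over AWGN; no bits, no FEC."""
    scale = np.sqrt(np.mean(x ** 2)) + 1e-12
    s = x / scale                                   # unit-power analog symbols
    noise = np.random.randn(*s.shape) / np.sqrt(10 ** (snr_db / 10))
    return (s + noise) * scale                      # receiver simply rescales

rng = np.random.default_rng(0)
img = rng.random(10_000)
for snr in (0, 10, 20, 30):                         # quality improves smoothly with SNR
    mse = np.mean((img - dtat_toy(img, snr)) ** 2)
    print(f"SNR {snr:2d} dB -> MSE {mse:.5f}")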

arXiv:2409.08825 (https://arxiv.org/abs/2409.08825) [pdf]
Subjects: Systems and Control (eess.SY); Robotics (cs.RO)
Title: Flight Testing of Latch Valve with Lightweight LV-Servo Direct Drive Mechanism
Authors: Hao-Che Huang, Chih-Shin Chang, Jui-Cheng Hsu, Shih-Sin Wei
Abstract: In the field of rocket technology, the latch valve assumes a pivotal role in regulating the flow of fuel gases and liquids to ensure the requisite energy supply. This project endeavors to innovate by replacing the conventional step motor mechanism with a servo motor for latch valve control. The selected servo motor, boasting a more compact form factor and reduced mass, aligns seamlessly with the project's overarching objectives. While servo motors offer myriad advantages, it is imperative to acknowledge and address the constraints on their maximum output torque to guarantee the latch valve's reliable operation. Furthermore, as a rocket ascends, it encounters significant fluctuations in internal temperature and pressure. Consequently, rigorous environmental testing becomes paramount to validate the servo motor's performance under these dynamic conditions, thus ensuring the latch valve's unwavering functionality. The primary focus of this project is the design and testing of the mechanism's performance in simulated rocket environments, achieved through the implementation of the servo motor for latch valve control. The results reveal that the servo motor demonstrated its effectiveness and reliability in controlling the latch valve under the rigorous environmental conditions of rocket flight.
Submitted 17 September, 2024; v1 submitted 13 September, 2024; originally announced September 2024.
Comments: 21 pages, 14 figures and 1 table
MSC Class: 74F10; ACM Class: J.2

arXiv:2409.06656 (https://arxiv.org/abs/2409.06656) [pdf, other]
Subjects: Audio and Speech Processing (eess.AS); Computation and Language (cs.CL); Machine Learning (cs.LG); Sound (cs.SD)
Title: Sortformer: Seamless Integration of Speaker Diarization and ASR by Bridging Timestamps and Tokens
Authors: Taejin Park, Ivan Medennikov, Kunal Dhawan, Weiqing Wang, He Huang, Nithin Rao Koluguri, Krishna C. Puvvada, Jagadeesh Balam, Boris Ginsburg
Abstract: We propose Sortformer, a novel neural model for speaker diarization, trained with unconventional objectives compared to existing end-to-end diarization models. The permutation problem in speaker diarization has long been regarded as a critical challenge. Most prior end-to-end diarization systems employ permutation invariant loss (PIL), which optimizes for the permutation that yields the lowest error. In contrast, we introduce Sort Loss, which enables a diarization model to autonomously resolve permutation, with or without PIL. We demonstrate that combining Sort Loss and PIL achieves performance competitive with state-of-the-art end-to-end diarization models trained exclusively with PIL. Crucially, we present a streamlined multispeaker ASR architecture that leverages Sortformer as a speaker supervision model, embedding speaker label estimation within the ASR encoder state using a sinusoidal kernel function. This approach resolves the speaker permutation problem through sorted objectives, effectively bridging speaker-label timestamps and speaker tokens. In our experiments, we show that the proposed multispeaker ASR architecture, enhanced with speaker supervision, improves performance via adapter techniques. Code and trained models will be made publicly available via the NVIDIA NeMo framework.
Submitted 10 September, 2024; originally announced September 2024.
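
One way to read Sort Loss (a sketch under assumptions, not the NeMo code): sort the reference speaker-activity tracks by each speaker's arrival time, then score the model's fixed output slots against the sorted targets with plain binary cross-entropy, so no permutation search is needed.

import torch
import torch.nn.functional as F

def sort_loss(logits, targets):
    # logits, targets: (B, T, S); targets are 0/1 speaker-activity tracks.
    arrival = (targets.cumsum(dim=1) == 0).sum(dim=1)    # (B, S) first-active frame index
    order = torch.argsort(arrival, dim=-1)               # speakers sorted by arrival time
    order = order.unsqueeze(1).expand_as(targets)        # (B, T, S) gather indices
    sorted_targets = torch.gather(targets, 2, order)
    return F.binary_cross_entropy_with_logits(logits, sorted_targets)

logits = torch.randn(2, 50, 4)
targets = (torch.rand(2, 50, 4) > 0.7).float()
print(sort_loss(logits, targets))  # silent speakers sort last; no permutation search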

arXiv:2409.02050 (https://arxiv.org/abs/2409.02050) [pdf, other]
Subjects: Computation and Language (cs.CL); Sound (cs.SD); Audio and Speech Processing (eess.AS)
Title: Enhancing Code-Switching Speech Recognition with LID-Based Collaborative Mixture of Experts Model
Authors: Hukai Huang, Jiayan Lin, Kaidi Wang, Yishuang Li, Wenhao Guan, Lin Li, Qingyang Hong
Abstract: Due to the inherent difficulty of modeling phonetic similarities across different languages, code-switching speech recognition presents a formidable challenge. This study proposes Collaborative-MoE, a Mixture of Experts (MoE) model that leverages a collaborative mechanism among expert groups. Initially, a preceding routing network explicitly learns Language Identification (LID) tasks and selects experts based on the acquired LID weights. This process provides robust routing information to the MoE layer, mitigating interference from diverse language domains on expert network parameter updates. The LID weights are also employed to facilitate inter-group collaboration, enabling the integration of language-specific representations. Furthermore, within each language expert group, a gating network operates without supervision to foster collaboration on attributes beyond language. Extensive experiments demonstrate the efficacy of our approach, achieving significant performance gains over alternative methods. Importantly, our method preserves the efficient inference capabilities characteristic of MoE models without requiring additional pre-training.
Submitted 5 September, 2024; v1 submitted 3 September, 2024; originally announced September 2024.
Comments: Accepted by IEEE SLT 2024
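
A sketch of the LID-guided routing idea, under assumptions: the routing network is trained on explicit language labels, and its posteriors, rather than an unsupervised gate, weight the per-language expert groups. Names and dimensions are hypothetical.

import torch
import torch.nn as nn
import torch.nn.functional as F

class LIDRouting(nn.Module):
    """Hypothetical: supervised LID posteriors gate per-language expert groups."""
    def __init__(self, d=256, n_langs=2):
        super().__init__()
        self.lid_head = nn.Linear(d, n_langs)
        self.groups = nn.ModuleList(nn.Linear(d, d) for _ in range(n_langs))

    def forward(self, h, lang=None):  # h: (B, T, d); lang: (B, T) labels when training
        lid_logits = self.lid_head(h)
        w = torch.softmax(lid_logits, dim=-1)                     # (B, T, n_langs)
        out = sum(w[..., i:i + 1] * g(h) for i, g in enumerate(self.groups))
        lid_loss = None
        if lang is not None:  # explicit LID supervision for the router
            lid_loss = F.cross_entropy(lid_logits.transpose(1, 2), lang)
        return out, lid_loss

h = torch.randn(2, 50, 256)
lang = torch.randint(0, 2, (2, 50))
out, loss = LIDRouting()(h, lang)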

arXiv:2409.01438 (https://arxiv.org/abs/2409.01438) [pdf, other]
Subjects: Audio and Speech Processing (eess.AS); Sound (cs.SD)
Title: Resource-Efficient Adaptation of Speech Foundation Models for Multi-Speaker ASR
Authors: Weiqing Wang, Kunal Dhawan, Taejin Park, Krishna C. Puvvada, Ivan Medennikov, Somshubra Majumdar, He Huang, Jagadeesh Balam, Boris Ginsburg
Abstract: Speech foundation models have achieved state-of-the-art (SoTA) performance across various tasks, such as automatic speech recognition (ASR) in hundreds of languages. However, multi-speaker ASR remains a challenging task for these models due to data scarcity and sparsity. In this paper, we present approaches to enable speech foundation models to process and understand multi-speaker speech with limited training data. Specifically, we adapt a speech foundation model for the multi-speaker ASR task using only telephonic data. Remarkably, the adapted model also performs well on meeting data without any fine-tuning, demonstrating the generalization ability of our approach. We conduct several ablation studies to analyze the impact of different parameters and strategies on model performance. Our findings highlight the effectiveness of our methods. Results show that fewer parameters give a better overall cpWER; although counter-intuitive, this provides insights into adapting speech foundation models for multi-speaker ASR tasks with minimal annotated data.
Submitted 2 September, 2024; originally announced September 2024.
Comments: Accepted by SLT 2024

arXiv:2408.13106 (https://arxiv.org/abs/2408.13106) [pdf, other]
Subjects: Sound (cs.SD); Audio and Speech Processing (eess.AS)
Title: NEST: Self-supervised Fast Conformer as All-purpose Seasoning to Speech Processing Tasks
Authors: He Huang, Taejin Park, Kunal Dhawan, Ivan Medennikov, Krishna C. Puvvada, Nithin Rao Koluguri, Weiqing Wang, Jagadeesh Balam, Boris Ginsburg
Abstract: Self-supervised learning has been shown to benefit a wide range of speech processing tasks, such as speech recognition/translation, speaker verification, and diarization. However, most current approaches are computationally expensive. In this paper, we propose a simplified and more efficient self-supervised learning framework termed NeMo Encoder for Speech Tasks (NEST). Specifically, we adopt the FastConformer architecture with an 8x sub-sampling rate, which is faster than the Transformer or Conformer architectures. Instead of clustering-based quantization, we use fixed random projection for its simplicity and effectiveness. We also implement a generalized noisy speech augmentation that teaches the model to disentangle the main speaker from noise or other speakers. Experiments show that NEST improves over existing self-supervised models and achieves new state-of-the-art performance on a variety of speech processing tasks, such as speech recognition/translation, speaker diarization, and spoken language understanding. Code and checkpoints will be publicly available via the NVIDIA NeMo framework.
Submitted 18 September, 2024; v1 submitted 23 August, 2024; originally announced August 2024.
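
The fixed-random-projection targets can be sketched in a few lines (an assumption-laden, BEST-RQ-style reading; names hypothetical). Both the projection and the codebook are frozen at initialization, and the model is trained to predict the resulting discrete labels at masked frames.

import torch
import torch.nn.functional as F

torch.manual_seed(0)
d_feat, d_code, vocab = 80, 16, 8192
proj = torch.randn(d_feat, d_code)       # fixed random projection (never trained)
codebook = torch.randn(vocab, d_code)    # fixed random codebook (never trained)

def ssl_targets(feats):                  # feats: (B, T, d_feat) log-mel frames
    z = F.normalize(feats @ proj, dim=-1)                       # (B, T, d_code)
    c = F.normalize(codebook, dim=-1).unsqueeze(0).expand(feats.shape[0], -1, -1)
    return torch.cdist(z, c).argmin(-1)                         # (B, T) discrete ids

feats = torch.randn(2, 100, d_feat)
print(ssl_targets(feats).shape)          # targets for masked-frame prediction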

arXiv:2407.17691 (https://arxiv.org/abs/2407.17691) [pdf, other]
Subjects: Networking and Internet Architecture (cs.NI); Systems and Control (eess.SY)
Title: System-Level Simulation Framework for NB-IoT: Key Features and Performance Evaluation
Authors: Shutao Zhang, Wenkun Wen, Peiran Wu, Hongqing Huang, Liya Zhu, Yijia Guo, Tingting Yang, Minghua Xia
Abstract: Narrowband Internet of Things (NB-IoT) is a technology specifically designated by the 3rd Generation Partnership Project (3GPP) to meet the explosive demand for massive machine-type communications (mMTC), and it is evolving toward RedCap. Industrial companies have increasingly adopted NB-IoT as the solution for mMTC due to its lightweight design and the comprehensive technical specifications released by 3GPP. This paper presents a system-level simulation framework for NB-IoT networks to evaluate their performance. The system-level simulator is structured into four parts: initialization, pre-generation, the main simulation loop, and post-processing. Additionally, three essential features are investigated to enhance coverage, support massive connections, and ensure low power consumption, respectively. Simulation results demonstrate that the cumulative distribution function curves of the signal-to-interference-and-noise ratio fully comply with industrial standards. Furthermore, the throughput performance shows how NB-IoT networks realize massive connections at the cost of data rate. This work highlights the framework's practical utility and paves the way for developing NB-IoT networks.
Submitted 13 August, 2024; v1 submitted 24 July, 2024; originally announced July 2024.
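
The four-part structure maps naturally onto a simulator skeleton like the one below: a generic system-level outline under assumptions, not the authors' code, with illustrative link-budget numbers.

import numpy as np

def run_sls(n_drops=5, n_ues=500, n_tti=1000, seed=1):
    rng = np.random.default_rng(seed)
    sinr_db = []
    for _ in range(n_drops):
        # 1) Initialization: drop UEs in the cell; fix carrier/power parameters.
        d_km = np.maximum(rng.uniform(0.0, 1.0, n_ues), 0.01)
        # 2) Pre-generation: large-scale path loss and traffic drawn ahead of the loop.
        pl_db = 120.9 + 37.6 * np.log10(d_km)            # illustrative urban path-loss model
        arrivals = rng.random((n_tti, n_ues)) < 0.01     # sporadic mMTC uplink traffic
        # 3) Main simulation loop: per-TTI, evaluate the link of each transmitting UE.
        for tti in range(n_tti):
            for ue in np.nonzero(arrivals[tti])[0]:
                rx_dbm = 23.0 - pl_db[ue]                # UE tx power minus path loss
                sinr_db.append(rx_dbm + 114.0)           # against an illustrative noise floor
        # 4) Post-processing: aggregate statistics across drops (e.g., the SINR CDF).
    return np.sort(np.asarray(sinr_db))

sinr = run_sls()
print(f"median SINR: {np.median(sinr):.1f} dB")          # one point on the empirical CDF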

arXiv:2407.13930 (https://arxiv.org/abs/2407.13930) [pdf, other]
Subjects: Computer Vision and Pattern Recognition (cs.CV); Artificial Intelligence (cs.AI); Signal Processing (eess.SP)
Title: RT-Pose: A 4D Radar Tensor-based 3D Human Pose Estimation and Localization Benchmark
Authors: Yuan-Hao Ho, Jen-Hao Cheng, Sheng Yao Kuan, Zhongyu Jiang, Wenhao Chai, Hsiang-Wei Huang, Chih-Lung Lin, Jenq-Neng Hwang
Abstract: Traditional methods for human localization and pose estimation (HPE), which mainly rely on RGB images as the input modality, face substantial limitations in real-world applications due to privacy concerns. In contrast, radar-based HPE methods emerge as a promising alternative, characterized by distinctive attributes such as through-wall recognition and privacy preservation, rendering them more conducive to practical deployment. This paper presents the Radar Tensor-based human pose (RT-Pose) dataset and an open-source benchmarking framework. The RT-Pose dataset comprises 4D radar tensors, LiDAR point clouds, and RGB images, collected over a total of 72k frames across 240 sequences with six actions of different complexity levels. The 4D radar tensor provides raw spatio-temporal information, differentiating it from other radar point-cloud-based datasets. We develop an annotation process using RGB images and LiDAR point clouds to accurately label 3D human skeletons. In addition, we propose HRRadarPose, the first single-stage architecture that extracts a high-resolution representation of 4D radar tensors in 3D space to aid human keypoint estimation. HRRadarPose outperforms previous radar-based HPE work on the RT-Pose benchmark. The overall HRRadarPose performance on the RT-Pose dataset, a mean per-joint position error (MPJPE) of 9.91 cm, indicates the persistent challenges in achieving accurate HPE in complex real-world scenarios. RT-Pose is available at https://huggingface.co/datasets/uwipl/RT-Pose.
Submitted 18 July, 2024; originally announced July 2024.
Comments: ECCV 2024
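
Since the dataset is hosted on the Hugging Face Hub, fetching it might look like the following; the repo id comes from the abstract's URL, while the local file layout and any loader fields are assumptions.

from huggingface_hub import snapshot_download

# Downloads the RT-Pose dataset repo locally; the exact folder structure is an assumption.
local_dir = snapshot_download(repo_id="uwipl/RT-Pose", repo_type="dataset")
print(local_dir)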

arXiv:2407.12870 (https://arxiv.org/abs/2407.12870) [pdf, other]
Subjects: Quantitative Methods (q-bio.QM); Machine Learning (cs.LG); Image and Video Processing (eess.IV)
Title: Revisiting Adaptive Cellular Recognition Under Domain Shifts: A Contextual Correspondence View
Authors: Jianan Fan, Dongnan Liu, Canran Li, Hang Chang, Heng Huang, Filip Braet, Mei Chen, Weidong Cai
Abstract: Cellular nuclei recognition serves as a fundamental and essential step in the workflow of digital pathology. However, with disparate source organs and staining procedures among histology image clusters, the scanned tiles inherently follow a non-uniform data distribution, which degrades performance in general cross-cohort use. Despite the latest efforts leveraging domain adaptation to mitigate distributional discrepancies, those methods model the morphological characteristics of each cell individually, disregarding the hierarchical latent structure and intrinsic contextual correspondences across the tumor micro-environment. In this work, we identify the importance of implicit correspondences across biological contexts for exploiting domain-invariant pathological composition, and we propose to exploit the dependence over various biological structures for domain-adaptive cellular recognition. We discover those high-level correspondences via unsupervised contextual modeling and use them as bridges to facilitate adaptation over diverse organs and stains. In addition, to further exploit the rich spatial contexts embedded amongst nuclear communities, we propose self-adaptive dynamic distillation to secure instance-aware trade-offs across different model constituents. The proposed method is extensively evaluated on a broad spectrum of cross-domain settings under miscellaneous data distribution shifts and outperforms state-of-the-art methods by a substantial margin. Code is available at https://github.com/camwew/CellularRecognition_DA_CC.
Submitted 19 July, 2024; v1 submitted 14 July, 2024; originally announced July 2024.
Comments: ECCV 2024 main conference

arXiv:2407.09807 (https://arxiv.org/abs/2407.09807) [pdf, other]
Subjects: Audio and Speech Processing (eess.AS)
Title: CUSIDE-array: A Streaming Multi-Channel End-to-End Speech Recognition System with Realistic Evaluations
Authors: Xiangzhu Kong, Tianqi Ning, Hao Huang, Zhijian Ou
Abstract: Recently, multi-channel end-to-end (ME2E) ASR systems have emerged. While streaming single-channel end-to-end ASR has been extensively studied, streaming ME2E ASR remains under-explored. Additionally, recent studies call attention to the gap between in-distribution (ID) and out-of-distribution (OOD) tests and to the need for realistic evaluations. This paper focuses on two research problems: realizing streaming ME2E ASR and improving OOD generalization. We propose the CUSIDE-array method, which integrates the recent CUSIDE methodology (Chunking, Simulating Future Context and Decoding) into the neural beamformer approach of ME2E ASR. It enables streaming processing of both the front-end and the back-end with a total latency of 402 ms. The CUSIDE-array ME2E models are shown to achieve superior streaming results in both ID and OOD tests. Realistic evaluations confirm the advantage of CUSIDE-array in its capability to consume single-channel data to improve OOD generalization via back-end pre-training and ME2E fine-tuning.
Submitted 17 September, 2024; v1 submitted 13 July, 2024; originally announced July 2024.
Comments: Accepted into ISCSLP 2024
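
The CUSIDE recipe can be caricatured as follows: each chunk is encoded together with a simulated right context instead of waiting for real future frames, which is what bounds the latency. The future-context simulator below is a trivial stand-in that repeats the last frame; in the actual method a small network predicts it. All names are illustrative.

import numpy as np

def simulate_future(chunk, n_right):
    # Stand-in for CUSIDE's learned context simulator: repeat the last frame.
    return np.repeat(chunk[-1:], n_right, axis=0)

def stream_encode(frames, encode, chunk_size=40, n_right=10):
    outputs = []
    for t in range(0, len(frames), chunk_size):
        chunk = frames[t:t + chunk_size]
        ctx = simulate_future(chunk, n_right)             # no waiting for real future frames
        enc = encode(np.concatenate([chunk, ctx], axis=0))
        outputs.append(enc[:len(chunk)])                  # keep outputs for real frames only
    return np.concatenate(outputs, axis=0)

feats = np.random.randn(200, 80)
out = stream_encode(feats, encode=lambda x: x * 1.0)      # identity "encoder" placeholder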

arXiv:2407.09094 (https://arxiv.org/abs/2407.09094) [pdf, other]
Subjects: Image and Video Processing (eess.IV); Computer Vision and Pattern Recognition (cs.CV)
Title: Beyond Image Prior: Embedding Noise Prior into Conditional Denoising Transformer
Authors: Yuanfei Huang, Hua Huang
Abstract: Existing learning-based denoising methods typically train models to generalize the image prior from large-scale datasets, and they suffer from the variability of noise distributions encountered in real-world scenarios. In this work, we propose a new perspective on the denoising challenge by highlighting the distinct separation between noise and image priors. This insight forms the basis for our development of a conditional optimization framework designed to overcome the constraints of the traditional denoising framework. To this end, we introduce a Locally Noise Prior Estimation (LoNPE) algorithm, which accurately estimates the noise prior directly from a single raw noisy image. This estimate acts as an explicit representation of the camera sensor's imaging environment, distinct from the image prior of scenes. Additionally, we design an auxiliary learnable LoNPE network tailored for practical application to sRGB noisy images. Leveraging the estimated noise prior, we present a novel Conditional Denoising Transformer (Condformer), which incorporates the noise prior into a conditional self-attention mechanism. This integration allows the Condformer to segment the optimization process into multiple explicit subspaces, significantly enhancing the model's generalization and flexibility. Extensive experimental evaluations on both synthetic and real-world datasets demonstrate that the proposed method achieves superior performance over current state-of-the-art methods. The source code is available at https://github.com/YuanfeiHuang/Condformer.
Submitted 12 July, 2024; originally announced July 2024.
Comments: Under Review
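
One plausible shape for the conditional self-attention (a sketch under assumptions, not the released Condformer): the estimated noise prior is embedded and injected as a conditioning token that the image tokens can attend to.

import torch
import torch.nn as nn

class CondSelfAttention(nn.Module):
    """Hypothetical: self-attention over image tokens plus one noise-prior token."""
    def __init__(self, d=192, n_heads=4, d_prior=2):
        super().__init__()
        self.prior_embed = nn.Linear(d_prior, d)   # e.g., (shot, read) noise parameters
        self.attn = nn.MultiheadAttention(d, n_heads, batch_first=True)

    def forward(self, tokens, noise_prior):        # tokens: (B, N, d); noise_prior: (B, d_prior)
        p = self.prior_embed(noise_prior).unsqueeze(1)     # (B, 1, d) conditioning token
        x = torch.cat([p, tokens], dim=1)
        y, _ = self.attn(x, x, x)
        return y[:, 1:]                                    # drop the prior token

x = torch.randn(2, 64, 192)
prior = torch.rand(2, 2)
print(CondSelfAttention()(x, prior).shape)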
We also introduce a novel plug-and-play fair identity attention (FIA) module that adapts to various DA and DG algorithms to improve fairness by using self-attention to adjust feature importance based on demographic attributes. Additionally, we curate the first fairness-focused dataset with two paired imaging modalities for the same patient cohort on medical segmentation and classification tasks, to rigorously assess fairness in domain-shift scenarios. Excluding the confounding impact of demographic distribution variation between source and target domains allows clearer quantification of the performance of domain-transfer models. Our extensive evaluations reveal that the proposed FIA significantly enhances fairness-aware model performance across all domain-shift settings (i.e., DA and DG) and demographic groups, outperforming existing methods on both segmentation and classification. The code and data can be accessed at https://ophai.hms.harvard.edu/datasets/harvard-fairdomain20k.
Submitted 18 July, 2024; v1 submitted 11 July, 2024; originally announced July 2024.
Comments: ECCV 2024; codes and datasets are available at https://github.com/Harvard-Ophthalmology-AI-Lab/FairDomain
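A minimal sketch of the idea behind an attribute-conditioned attention module of this kind (module name, shapes, and residual wiring are assumptions based only on the abstract):

```python
# Hypothetical "fair identity attention" style module: a demographic-attribute
# embedding forms the query and re-weights token features via attention.
import torch
import torch.nn as nn

class FairIdentityAttention(nn.Module):
    def __init__(self, dim: int, num_groups: int):
        super().__init__()
        self.group_embed = nn.Embedding(num_groups, dim)  # one vector per group
        self.q = nn.Linear(dim, dim)
        self.k = nn.Linear(dim, dim)
        self.v = nn.Linear(dim, dim)

    def forward(self, feats: torch.Tensor, group_id: torch.Tensor) -> torch.Tensor:
        # feats: (B, N, C); group_id: (B,) integer demographic attribute
        g = self.group_embed(group_id).unsqueeze(1)   # (B, 1, C)
        q = self.q(g)                                 # group-conditioned query
        k, v = self.k(feats), self.v(feats)
        attn = torch.softmax(q @ k.transpose(-2, -1) / feats.size(-1) ** 0.5, dim=-1)
        weights = attn.transpose(-2, -1)              # (B, N, 1) importance per token
        return feats + weights * v                    # residual re-weighting

feats = torch.randn(4, 16, 32)
out = FairIdentityAttention(32, num_groups=3)(feats, torch.tensor([0, 1, 2, 0]))
print(out.shape)  # torch.Size([4, 16, 32])
```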
arXiv:2407.07921 [pdf, other] (https://arxiv.org/abs/2407.07921)
Subjects: cs.CR (Cryptography and Security); cs.AI (Artificial Intelligence); cs.LG (Machine Learning); eess.SP (Signal Processing)
Title: A Trustworthy AIoT-enabled Localization System via Federated Learning and Blockchain
Authors: Junfei Wang, He Huang, Jingze Feng, Steven Wong, Lihua Xie, Jianfei Yang
Abstract: There is a significant demand for indoor localization technology in smart buildings, and the most promising solution in this field is to use RF sensors and fingerprinting-based methods that employ machine learning models trained on crowd-sourced user data gathered from IoT devices. However, this raises security and privacy issues in practice. Some researchers propose federated learning to partially overcome privacy problems, but security concerns remain, e.g., single-point failure and malicious attacks. In this paper, we propose a framework named DFLoc to achieve precise 3D localization while addressing these two security concerns. In particular, we design a specialized blockchain that decentralizes the framework by distributing tasks such as model distribution and aggregation, which most previous works assign to a central server, across all clients, addressing the single-point-failure issue and yielding a reliable and accurate indoor localization system. Moreover, we introduce an updated model verification mechanism within the blockchain to alleviate the concern of malicious node attacks. Experimental results substantiate the framework's capacity to deliver accurate 3D location predictions and its superior resistance to the impacts of single-point failure and malicious attacks compared to conventional centralized federated learning systems.
Submitted 8 July, 2024; originally announced July 2024.
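To make the aggregation-plus-verification idea concrete, here is a toy federated-averaging step with a naive robust filter; the median-based rejection rule is a stand-in assumption, not the paper's verification scheme:

```python
# FedAvg over client updates, dropping updates that sit far from the median
# (a simple proxy for verifying models before aggregation).
import numpy as np

def verify(updates: list[np.ndarray], tol: float = 3.0) -> list[np.ndarray]:
    """Keep updates whose distance to the element-wise median is within tol * MAD."""
    stacked = np.stack(updates)
    dists = np.linalg.norm(stacked - np.median(stacked, axis=0), axis=1)
    med = np.median(dists)
    mad = np.median(np.abs(dists - med)) + 1e-9
    return [u for u, d in zip(updates, dists) if d <= med + tol * mad]

def aggregate(updates: list[np.ndarray]) -> np.ndarray:
    """Average only the verified client updates."""
    return np.mean(np.stack(verify(updates)), axis=0)

honest = [np.random.randn(10) * 0.1 for _ in range(8)]
malicious = [np.ones(10) * 50.0]  # a poisoned update
print(np.linalg.norm(aggregate(honest + malicious)))  # stays small: attacker filtered
```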
arXiv:2406.19954 [pdf, other] (https://arxiv.org/abs/2406.19954)
Subjects: cs.CL (Computation and Language); cs.HC (Human-Computer Interaction); cs.SD (Sound); eess.AS (Audio and Speech Processing)
Title: BESTOW: Efficient and Streamable Speech Language Model with the Best of Two Worlds in GPT and T5
Authors: Zhehuai Chen, He Huang, Oleksii Hrinchuk, Krishna C. Puvvada, Nithin Rao Koluguri, Piotr Żelasko, Jagadeesh Balam, Boris Ginsburg
Abstract: Incorporating speech understanding capabilities into pretrained large language models has become a vital research direction (SpeechLLM). Previous architectures can be categorized as: i) GPT-style, which prepends speech prompts to the text prompts as a sequence of LLM inputs, like a decoder-only model; ii) T5-style, which introduces speech cross-attention to each layer of the pretrained LLM. We propose the BESTOW architecture to bring the BESt features from TwO Worlds into a single model that is highly efficient and has strong multitask capabilities. Moreover, there is no clear streaming solution for either style, especially considering that the solution should generalize to speech multitask settings. We reformulate the streamable SpeechLLM as a read-write policy problem and unify the offline and streaming research with the BESTOW architecture.
Hence, we demonstrate the first open-source SpeechLLM solution that enables streaming and multitask support at scale (beyond ASR) at the same time. This streamable solution achieves very strong performance on a wide range of speech tasks (ASR, AST, SQA, unseen DynamicSuperb). It is end-to-end optimizable, has lower training/inference cost, and demonstrates LLM knowledge transferability to speech.
Submitted 28 June, 2024; originally announced June 2024.
MSC Class: 68T10; ACM Class: I.2.7
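The two wirings named in the abstract can be contrasted in a few lines; the module shapes below are illustrative assumptions (and an encoder layer stands in for a decoder block), not the BESTOW architecture:

```python
# Toy contrast of GPT-style (prepend speech to text) vs T5-style (cross-attend
# to speech) SpeechLLM wiring.
import torch
import torch.nn as nn

d = 64
speech = torch.randn(2, 50, d)  # encoded speech frames
text = torch.randn(2, 12, d)    # embedded text prompt tokens

# i) GPT-style: one long sequence through a single self-attention stack.
block = nn.TransformerEncoderLayer(d, nhead=4, batch_first=True)
gpt_out = block(torch.cat([speech, text], dim=1))        # (2, 62, d)

# ii) T5-style: the text stream cross-attends to speech at each layer.
cross = nn.MultiheadAttention(d, num_heads=4, batch_first=True)
t5_out, _ = cross(query=text, key=speech, value=speech)  # (2, 12, d)

print(gpt_out.shape, t5_out.shape)
```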
arXiv:2406.19674 [pdf, other] (https://arxiv.org/abs/2406.19674)
Subjects: cs.CL (Computation and Language); cs.LG (Machine Learning); cs.SD (Sound); eess.AS (Audio and Speech Processing)
Title: Less is More: Accurate Speech Recognition & Translation without Web-Scale Data
Authors: Krishna C. Puvvada, Piotr Żelasko, He Huang, Oleksii Hrinchuk, Nithin Rao Koluguri, Kunal Dhawan, Somshubra Majumdar, Elena Rastorgueva, Zhehuai Chen, Vitaly Lavrukhin, Jagadeesh Balam, Boris Ginsburg
Abstract: Recent advances in speech recognition and translation rely on hundreds of thousands of hours of Internet speech data. We argue that state-of-the-art accuracy can be reached without relying on web-scale data. Canary, a multilingual ASR and speech translation model, outperforms current state-of-the-art models (Whisper, OWSM, and Seamless-M4T) on English, French, Spanish, and German, while being trained on an order of magnitude less data than these models. Three key factors enable such a data-efficient model: (1) a FastConformer-based attention encoder-decoder architecture, (2) training on synthetic data generated with machine translation, and (3) advanced training techniques: data balancing, dynamic data blending, dynamic bucketing, and noise-robust fine-tuning. The model, weights, and training code will be open-sourced.
Submitted 28 June, 2024; originally announced June 2024.
Comments: Accepted at Interspeech-2024
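As a rough illustration of one listed technique, duration bucketing for speech batching, here is a minimal sketch; the bucket boundaries and budget are arbitrary choices, not the NeMo/Lhotse mechanics used in practice:

```python
# Group utterances into duration buckets, then emit batches under a total-
# seconds budget so batch shapes stay homogeneous and padding stays low.
import random

def bucket_batches(utts, boundaries=(4.0, 8.0, 16.0), max_seconds=64.0):
    buckets = {b: [] for b in boundaries}
    for utt_id, dur in utts:
        for b in boundaries:
            if dur <= b:
                buckets[b].append((utt_id, dur))
                break
    for items in buckets.values():
        random.shuffle(items)
        batch, total = [], 0.0
        for item in items:
            if total + item[1] > max_seconds and batch:
                yield batch
                batch, total = [], 0.0
            batch.append(item)
            total += item[1]
        if batch:
            yield batch

utts = [(f"utt{i}", random.uniform(1.0, 15.0)) for i in range(20)]
for batch in bucket_batches(utts):
    print(len(batch), round(sum(d for _, d in batch), 1))
```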
arXiv:2406.18871 [pdf, other] (https://arxiv.org/abs/2406.18871)
Subjects: eess.AS (Audio and Speech Processing); cs.CL (Computation and Language)
Title: DeSTA: Enhancing Speech Language Models through Descriptive Speech-Text Alignment
Authors: Ke-Han Lu, Zhehuai Chen, Szu-Wei Fu, He Huang, Boris Ginsburg, Yu-Chiang Frank Wang, Hung-yi Lee
Abstract: Recent speech language models (SLMs) typically incorporate pre-trained speech models to extend the capabilities of large language models (LLMs). In this paper, we propose a descriptive speech-text alignment approach that leverages speech captioning to bridge the gap between the speech and text modalities, enabling SLMs to interpret and generate comprehensive natural language descriptions and thereby to understand both linguistic and non-linguistic features in speech. Enhanced with the proposed approach, our model demonstrates superior performance on the Dynamic-SUPERB benchmark, particularly in generalizing to unseen tasks. Moreover, we discover that the aligned model exhibits a zero-shot instruction-following capability without explicit speech instruction tuning. These findings highlight the potential to reshape instruction-following SLMs by incorporating rich, descriptive speech captions.
Submitted 26 June, 2024; originally announced June 2024.
Comments: Accepted to Interspeech 2024
arXiv:2406.18018 [pdf, other] (https://arxiv.org/abs/2406.18018)
Subjects: eess.IV (Image and Video Processing)
Title: A Cross Spatio-Temporal Pathology-based Lung Nodule Dataset
Authors: Muwei Jian, Haoran Zhang, Mingju Shao, Hongyu Chen, Huihui Huang, Yanjie Zhong, Changlei Zhang, Bin Wang, Penghui Gao
Abstract: Recently, intelligent analysis of lung nodules with the assistance of computer-aided detection (CAD) techniques has been able to improve the accuracy of lung cancer diagnosis. However, existing CAD systems and pulmonary datasets mainly focus on Computed Tomography (CT) images from a single period, ignoring the cross spatio-temporal features associated with the progression of nodules contained in imaging data from various captured periods of lung cancer.
If the evolution patterns of nodules across various periods in patients' CT sequences can be explored, it will play a crucial role in guiding the precise screening and identification of lung cancer. Therefore, a cross spatio-temporal lung nodule dataset with pathological information for nodule identification and diagnosis is constructed, containing 328 CT sequences and 362 annotated nodules from 109 patients. This comprehensive database is intended to drive CAD research towards more practical and robust methods and to contribute to the further exploration of precision-medicine-related fields. To ensure patient confidentiality, we have removed sensitive information from the dataset.
Submitted 25 June, 2024; originally announced June 2024.

arXiv:2406.02166 [pdf, other] (https://arxiv.org/abs/2406.02166)
Subjects: cs.SD (Sound); cs.CL (Computation and Language); eess.AS (Audio and Speech Processing)
Title: Whistle: Data-Efficient Multilingual and Crosslingual Speech Recognition via Weakly Phonetic Supervision
Authors: Saierdaer Yusuyin, Te Ma, Hao Huang, Wenbo Zhao, Zhijian Ou
Abstract: There exist three approaches for multilingual and crosslingual automatic speech recognition (MCL-ASR): supervised pre-training with phonetic transcription, supervised pre-training with graphemic transcription, and self-supervised pre-training. We find that pre-training with phonetic supervision has so far been underappreciated for MCL-ASR, while conceptually it is more advantageous for information sharing between different languages. This paper explores pre-training with weakly phonetic supervision towards data-efficient MCL-ASR, which we call Whistle. We relax the requirement of gold-standard human-validated phonetic transcripts and obtain International Phonetic Alphabet (IPA) based transcriptions by leveraging the LanguageNet grapheme-to-phoneme (G2P) models. We construct a common experimental setup based on the CommonVoice dataset, called CV-Lang10, with 10 seen languages and 2 unseen languages. A set of experiments is conducted on CV-Lang10 to compare, as fairly as possible, the three approaches under the common setup for MCL-ASR. Experiments demonstrate the advantages of phoneme-based models (Whistle) for MCL-ASR in terms of speech recognition for seen languages, crosslingual performance for unseen languages with different amounts of few-shot data, overcoming catastrophic forgetting, and training efficiency. It is found that when training data is more limited, phoneme supervision achieves better results than subword supervision and self-supervision, thereby providing higher data efficiency. To support reproducibility and promote future research along this direction, we will release the code, models, and data for the whole pipeline of Whistle at https://github.com/thu-spmi/CAT upon publication.
Submitted 4 June, 2024; originally announced June 2024.
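A toy illustration of weak phonetic supervision: map words to IPA with a stand-in G2P table (the paper uses LanguageNet G2P models, not a lexicon like this) and train an acoustic model against CTC:

```python
# Weak IPA labels from a placeholder G2P lexicon, fed to CTC; the tiny model
# and lexicon are illustrative only.
import torch
import torch.nn as nn

G2P = {"hello": ["h", "ə", "l", "oʊ"], "world": ["w", "ɜ", "l", "d"]}  # toy lexicon
phones = sorted({p for prons in G2P.values() for p in prons})
phone_to_id = {p: i + 1 for i, p in enumerate(phones)}  # 0 reserved for CTC blank

def ipa_targets(text: str) -> torch.Tensor:
    return torch.tensor([phone_to_id[p] for w in text.split() for p in G2P[w]])

targets = ipa_targets("hello world")              # weak labels, no human validation
T, C = 30, len(phone_to_id) + 1                   # frames, classes incl. blank
log_probs = torch.randn(T, 1, C).log_softmax(-1)  # stand-in acoustic model output
loss = nn.CTCLoss(blank=0)(log_probs, targets.unsqueeze(0),
                           torch.tensor([T]), torch.tensor([len(targets)]))
print(targets.tolist(), float(loss))
```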
arXiv:2406.01205 [pdf, other] (https://arxiv.org/abs/2406.01205)
Subjects: eess.AS (Audio and Speech Processing); cs.LG (Machine Learning); cs.SD (Sound)
Title: ControlSpeech: Towards Simultaneous Zero-shot Speaker Cloning and Zero-shot Language Style Control With Decoupled Codec
Authors: Shengpeng Ji, Jialong Zuo, Wen Wang, Minghui Fang, Siqi Zheng, Qian Chen, Ziyue Jiang, Hai Huang, Zehan Wang, Xize Cheng, Zhou Zhao
Abstract: In this paper, we present ControlSpeech, a text-to-speech (TTS) system capable of fully cloning the speaker's voice and enabling arbitrary control and adjustment of speaking style, based merely on a few seconds of audio prompt and a simple textual style description. Prior zero-shot TTS models and controllable TTS models could either only mimic the speaker's voice without further control and adjustment capabilities, or were unrelated to speaker-specific voice generation. Therefore, ControlSpeech focuses on a more challenging new task: a TTS system with controllable timbre, content, and style at the same time.
ControlSpeech takes speech prompts, content prompts, and style prompts as inputs and uses bidirectional attention and mask-based parallel decoding to capture the corresponding codec representations in a discrete, decoupled codec space. Moreover, we identify a many-to-many mapping issue in text style controllability and propose the Style Mixture Semantic Density (SMSD) model to resolve it. The SMSD module, based on Gaussian mixture density networks, is designed to enhance the fine-grained partitioning and sampling of style semantic information and to generate speech with more diverse styles. For experiments, we make available a controllable model toolkit called ControlToolkit with a new style-controllable dataset and some replicated baseline models, and we propose new metrics to evaluate both the control capability and the quality of the generated audio in ControlSpeech. Ablation studies validate the necessity of each component of ControlSpeech. We hope that ControlSpeech can establish the next foundational paradigm for controllable speech synthesis. The relevant code and demo are available at https://github.com/jishengpeng/ControlSpeech.
Submitted 22 October, 2024; v1 submitted 3 June, 2024; originally announced June 2024.
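A minimal mixture-density-network sketch for the SMSD idea: predict a Gaussian mixture over style embeddings from an encoded style description, then sample for diversity. Dimensions and the sampling scheme are assumptions based only on the abstract:

```python
# Gaussian mixture density network over style embeddings.
import torch
import torch.nn as nn

class StyleMDN(nn.Module):
    def __init__(self, in_dim: int, style_dim: int, n_mix: int = 4):
        super().__init__()
        self.n_mix, self.style_dim = n_mix, style_dim
        self.net = nn.Linear(in_dim, n_mix * (1 + 2 * style_dim))

    def forward(self, h: torch.Tensor):
        out = self.net(h).view(-1, self.n_mix, 1 + 2 * self.style_dim)
        logit_pi = out[..., 0]                       # mixture weights
        mu = out[..., 1:1 + self.style_dim]          # component means
        sigma = out[..., 1 + self.style_dim:].exp()  # positive stddevs
        return logit_pi, mu, sigma

    def sample(self, h: torch.Tensor) -> torch.Tensor:
        logit_pi, mu, sigma = self(h)
        k = torch.distributions.Categorical(logits=logit_pi).sample()  # pick component
        idx = k.view(-1, 1, 1).expand(-1, 1, self.style_dim)
        mu_k = mu.gather(1, idx).squeeze(1)
        sigma_k = sigma.gather(1, idx).squeeze(1)
        return mu_k + sigma_k * torch.randn_like(mu_k)  # one diverse style sample

text_repr = torch.randn(3, 128)  # e.g. encoded style-description prompt
print(StyleMDN(128, 32).sample(text_repr).shape)  # torch.Size([3, 32])
```

Sampling a component first and then a Gaussian within it is what yields distinct, diverse styles rather than a single averaged one.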
arXiv:2406.00683 [pdf, other] (https://arxiv.org/abs/2406.00683)
Subjects: eess.IV (Image and Video Processing); cs.CV (Computer Vision and Pattern Recognition); cs.MM (Multimedia)
Title: Exploiting Frequency Correlation for Hyperspectral Image Reconstruction
Authors: Muge Yan, Lizhi Wang, Lin Zhu, Hua Huang
Abstract: Deep priors have emerged as potent methods in hyperspectral image (HSI) reconstruction. While most methods emphasize space-domain learning using image-space priors such as non-local similarity, frequency-domain learning using image frequency priors remains neglected, limiting the reconstruction capability of networks. In this paper, we first propose a Hyperspectral Frequency Correlation (HFC) prior rooted in in-depth statistical frequency analyses of existing HSI datasets. Leveraging the HFC prior, we then establish frequency-domain learning composed of a Spectral-wise self-Attention of Frequency (SAF) and a Spectral-spatial Interaction of Frequency (SIF), targeting low-frequency and high-frequency components, respectively. The outputs of SAF and SIF are adaptively merged by a learnable gating filter, achieving a thorough exploitation of image frequency priors. Integrating the frequency-domain learning with the existing space-domain learning, we finally develop the Correlation-driven Mixing Domains Transformer (CMDT) for HSI reconstruction. Extensive experiments show that our method surpasses various state-of-the-art (SOTA) methods in reconstruction quality and computational efficiency.
Submitted 2 June, 2024; originally announced June 2024.
Comments: 14 pages, 11 figures
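The kind of low/high-frequency decomposition that SAF and SIF would operate on can be sketched with a 2D FFT; the radius threshold below is an illustrative choice, not the paper's gating filter:

```python
# Split a hyperspectral cube into low- and high-frequency spatial components.
import torch

def frequency_split(x: torch.Tensor, radius: float = 0.25):
    """x: (bands, H, W). Returns (low, high) spatial-domain components."""
    bands, h, w = x.shape
    spec = torch.fft.fftshift(torch.fft.fft2(x), dim=(-2, -1))
    yy, xx = torch.meshgrid(torch.linspace(-1, 1, h), torch.linspace(-1, 1, w),
                            indexing="ij")
    mask = ((yy ** 2 + xx ** 2).sqrt() <= radius).to(spec.dtype)  # low-pass disk
    low = torch.fft.ifft2(torch.fft.ifftshift(spec * mask, dim=(-2, -1))).real
    high = x - low
    return low, high

cube = torch.randn(28, 64, 64)  # toy 28-band hyperspectral patch
low, high = frequency_split(cube)
print(torch.allclose(low + high, cube, atol=1e-4))  # True: lossless split
```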
arXiv:2405.14300 [pdf, other] (https://arxiv.org/abs/2405.14300)
Subjects: eess.IV (Image and Video Processing); cs.CV (Computer Vision and Pattern Recognition)
Title: Automatic diagnosis of cardiac magnetic resonance images based on semi-supervised learning
Authors: Hejun Huang, Zuguo Chen, Yi Huang, Guangqiang Luo, Chaoyang Chen, Youzhi Song
Abstract: Cardiac magnetic resonance imaging (MRI) is a pivotal tool for assessing cardiac function, and precise segmentation of cardiac structures is imperative for accurate functional evaluation. This paper introduces a semi-supervised model for automatic segmentation of cardiac images and auxiliary diagnosis. Harnessing cardiac MRI images and requiring only a small portion of annotated image data, the model achieves fully automated, high-precision segmentation of cardiac images, extraction of features, calculation of clinical indices, and prediction of diseases. The provided segmentation results, clinical indices, and prediction outcomes can aid physicians in diagnosis, serving as auxiliary diagnostic tools. Experimental results show that the model attains high accuracy in segmentation and correctness in prediction, demonstrating substantial practical guidance and application value.
Submitted 23 May, 2024; originally announced May 2024.
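A generic pseudo-labeling loop of the kind commonly used in semi-supervised segmentation (the abstract does not specify the paper's exact method; this is a stand-in illustration):

```python
# Self-training: supervised loss on labeled data plus a loss on confident
# pseudo-labels for unlabeled data.
import torch
import torch.nn as nn

model = nn.Conv2d(1, 4, kernel_size=3, padding=1)  # toy 4-class segmenter
opt = torch.optim.Adam(model.parameters(), lr=1e-3)
ce = nn.CrossEntropyLoss(ignore_index=-1)          # -1 marks unconfident pixels

labeled = (torch.randn(2, 1, 32, 32), torch.randint(0, 4, (2, 32, 32)))
unlabeled = torch.randn(2, 1, 32, 32)

for step in range(3):
    x, y = labeled
    loss = ce(model(x), y)                          # supervised term
    with torch.no_grad():                           # pseudo-labels, no gradient
        conf, pseudo = model(unlabeled).softmax(1).max(1)
        pseudo[conf < conf.median()] = -1           # keep only the confident half
    loss = loss + 0.5 * ce(model(unlabeled), pseudo)
    opt.zero_grad(); loss.backward(); opt.step()
    print(step, float(loss))
```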
arXiv:2404.15992 [pdf, other] (https://arxiv.org/abs/2404.15992)
Subjects: cs.CV (Computer Vision and Pattern Recognition); eess.IV (Image and Video Processing)
Title: GAN-HA: A generative adversarial network with a novel heterogeneous dual-discriminator network and a new attention-based fusion strategy for infrared and visible image fusion
Authors: Guosheng Lu, Zile Fang, Jiaju Tian, Haowen Huang, Yuelong Xu, Zhuolin Han, Yaoming Kang, Can Feng, Zhigang Zhao
Abstract: Infrared and visible image fusion (IVIF) aims to preserve thermal radiation information from infrared images while integrating texture details from visible images. Thermal radiation information is mainly expressed through image intensities, while texture details are typically expressed through image gradients.
However, existing dual-discriminator generative adversarial networks (GANs) often rely on two structurally identical discriminators, which does not account for the distinct learning needs of infrared and visible image information. To this end, this paper proposes a novel GAN with a heterogeneous dual-discriminator network and an attention-based fusion strategy (GAN-HA). Specifically, recognizing the intrinsic differences between infrared and visible images, we propose, for the first time, a heterogeneous dual-discriminator network to simultaneously capture thermal radiation information and texture details. The two discriminators in this network are structurally different: a salient discriminator for infrared images and a detailed discriminator for visible images, which learn rich image-intensity information and image-gradient information, respectively. In addition, a new attention-based fusion strategy is designed in the generator to appropriately emphasize the learned information from the different source images, improving the information-representation ability of the fusion result. In this way, the fused images generated by GAN-HA more effectively maintain both the salience of thermal targets and the sharpness of textures. Extensive experiments on various public datasets demonstrate the superiority of GAN-HA over other state-of-the-art (SOTA) algorithms and showcase its potential for practical applications.
Submitted 2 September, 2024; v1 submitted 24 April, 2024; originally announced April 2024.
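A sketch of the heterogeneous-discriminator idea: one critic looks at raw intensities (thermal salience), the other at image gradients (texture). Architectures and inputs are illustrative, not the GAN-HA networks:

```python
# Two structurally distinct critics: intensity-based vs gradient-based.
import torch
import torch.nn as nn

def grad_mag(img: torch.Tensor) -> torch.Tensor:
    """Finite-difference gradient magnitude, cropped to a common size."""
    dx = img[..., :, 1:] - img[..., :, :-1]
    dy = img[..., 1:, :] - img[..., :-1, :]
    return (dx[..., :-1, :] ** 2 + dy[..., :, :-1] ** 2 + 1e-8).sqrt()

def critic() -> nn.Module:
    return nn.Sequential(nn.Conv2d(2, 16, 4, 2, 1), nn.LeakyReLU(0.2),
                         nn.Conv2d(16, 1, 4, 2, 1))

d_salient, d_detail = critic(), critic()  # intensity critic vs gradient critic
fused = torch.rand(1, 1, 32, 32, requires_grad=True)
ir, vis = torch.rand(1, 1, 32, 32), torch.rand(1, 1, 32, 32)

# Salient critic compares fused vs infrared intensities; detail critic compares
# fused vs visible gradients. A generator would minimize both adversarial terms.
s_score = d_salient(torch.cat([fused, ir], dim=1)).mean()
t_score = d_detail(torch.cat([grad_mag(fused), grad_mag(vis)], dim=1)).mean()
print(float(s_score), float(t_score))
```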
arXiv:2404.09192 [pdf, other] (https://arxiv.org/abs/2404.09192)
Subjects: cs.SD (Sound); cs.AI (Artificial Intelligence); eess.AS (Audio and Speech Processing)
Title: Prior-agnostic Multi-scale Contrastive Text-Audio Pre-training for Parallelized TTS Frontend Modeling
Authors: Quanxiu Wang, Hui Huang, Mingjie Wang, Yong Dai, Jinzuomu Zhong, Benlai Tang
Abstract: Over the past decade, sustained efforts have been dedicated to developing highly expressive and controllable text-to-speech (TTS) systems. In general, a holistic TTS system comprises two interconnected components: the frontend module and the backend module. The frontend excels at capturing linguistic representations from the raw text input, while the backend converts linguistic cues to speech. The research community has shown growing interest in the frontend component, recognizing its pivotal role in text-to-speech systems, including Text Normalization (TN), Prosody Boundary Prediction (PBP), and Polyphone Disambiguation (PD). Nonetheless, the limitations posed by insufficient annotated textual data and the reliance on homogeneous text signals significantly undermine the effectiveness of supervised learning for the frontend. To evade this obstacle, a novel two-stage TTS frontend prediction pipeline, named TAP-FM, is proposed in this paper. Specifically, in the first learning phase, we present a Multi-scale Contrastive Text-Audio Pre-training protocol (MC-TAP), which aims to acquire richer text-audio representations via multi-granularity contrastive pre-training in an unsupervised manner.
In contrast to prior pre-training approaches that mine homogeneous features, our framework delves into both global and local text-audio semantic and acoustic representations. Furthermore, a parallelized TTS frontend model is devised to execute the TN, PD, and PBP prediction tasks in the second stage. Finally, extensive experiments illustrate the superiority of our proposed method, which achieves state-of-the-art performance.
Submitted 14 April, 2024; originally announced April 2024.
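The core of contrastive text-audio pre-training at a single granularity is the symmetric InfoNCE objective; MC-TAP's multi-scale details are omitted in this minimal sketch:

```python
# CLIP-style symmetric contrastive loss over paired text/audio embeddings.
import torch
import torch.nn.functional as F

def contrastive_loss(text_emb: torch.Tensor, audio_emb: torch.Tensor,
                     temperature: float = 0.07) -> torch.Tensor:
    """Paired rows are positives; every other row in the batch is a negative."""
    t = F.normalize(text_emb, dim=-1)
    a = F.normalize(audio_emb, dim=-1)
    logits = t @ a.T / temperature    # (B, B) similarity matrix
    labels = torch.arange(t.size(0))  # the diagonal holds the true pairs
    return (F.cross_entropy(logits, labels) +
            F.cross_entropy(logits.T, labels)) / 2

text_emb, audio_emb = torch.randn(8, 256), torch.randn(8, 256)
print(float(contrastive_loss(text_emb, audio_emb)))
```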
arXiv:2404.07477 [pdf, other] (https://arxiv.org/abs/2404.07477)
Subjects: eess.SP (Signal Processing)
Title: Integrated Sensing and Communication Under DISCO Physical-Layer Jamming Attacks
Authors: Huan Huang, Hongliang Zhang, Weidong Mei, Jun Li, Yi Cai, A. Lee Swindlehurst, Zhu Han
Abstract: Integrated sensing and communication (ISAC) systems traditionally presuppose that sensing and communication (S&C) channels remain approximately constant during their coherence time. However, a "DISCO" reconfigurable intelligent surface (DRIS), i.e., an illegitimate RIS with random, time-varying reflection properties that acts like a "disco ball," introduces a paradigm shift by enabling active channel aging within the channel coherence time. In this letter, we investigate the impact of DISCO jamming attacks launched by a DRIS-based fully-passive jammer (FPJ) on an ISAC system. Specifically, we present an ISAC problem formulation and a corresponding waveform optimization in which the waveform design accounts for the trade-off between sensing and communication performance and is formulated as a Pareto optimization problem. Moreover, a theoretical analysis is conducted to quantify the impact of DISCO jamming attacks. Numerical results are presented to evaluate the S&C performance under DISCO jamming attacks and to validate the derived theoretical analysis.
Submitted 11 July, 2024; v1 submitted 11 April, 2024; originally announced April 2024.
Comments: Submitted for possible publication. Code for the DISCO RIS is available on GitHub (https://github.com/huanhuan1799/Disco-Intelligent-Reflecting-Surfaces-Active-Channel-Aging-for-Fully-Passive-Jamming-Attacks)
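A toy simulation of the "disco ball" effect: a surface that draws fresh random phase shifts every slot makes the effective channel decorrelate within what would otherwise be its coherence time. Channel sizes and models below are illustrative assumptions, not the letter's system model:

```python
# Effective channel = direct path + cascaded path through a randomly
# re-phased surface; fresh phases each slot emulate active channel aging.
import numpy as np

rng = np.random.default_rng(0)
n_elements, n_slots = 64, 200
h_d = (rng.normal() + 1j * rng.normal()) / np.sqrt(2)  # static direct path
g_in = (rng.normal(size=n_elements) + 1j * rng.normal(size=n_elements)) / np.sqrt(2)
g_out = (rng.normal(size=n_elements) + 1j * rng.normal(size=n_elements)) / np.sqrt(2)

# The jammer needs no CSI: it simply randomizes its reflection every slot.
h = np.array([h_d + np.sum(g_out * np.exp(1j * rng.uniform(0, 2 * np.pi, n_elements)) * g_in)
              for _ in range(n_slots)])

corr = np.corrcoef(np.abs(h[:-1]), np.abs(h[1:]))[0, 1]
print("slot-to-slot correlation of |h|:", round(float(corr), 3))  # near zero
```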
arXiv:2404.07092 [pdf, other] (https://arxiv.org/abs/2404.07092)
Subjects: eess.SP (Signal Processing); physics.optics (Optics)
Title: Net 835-Gb/s/λ Carrier- and LO-Free 100-km Transmission Using Channel-Aware Phase Retrieval Reception
Authors: Hanzi Huang, Haoshuo Chen, Qian Hu, Di Che, Yetian Huang, Brian Stern, Nicolas K. Fontaine, Mikael Mazur, Lauren Dallachiesa, Roland Ryf, Zhengxuan Li, Yingxiong Song
Abstract: We experimentally demonstrate the first carrier- and LO-free 800G/λ receiver enabling direct compatibility with standard coherent transmitters via phase retrieval, achieving net 835-Gb/s transmission over 100-km SMF and a record 8.27 b/s/Hz net optical spectral efficiency.
Submitted 10 April, 2024; originally announced April 2024.
Comments: 3 pages, 3 figures
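For intuition, the textbook flavor of phase retrieval is the Gerchberg-Saxton alternating projection, which recovers a signal's phase from two intensity measurements; the paper's channel-aware receiver algorithm is far more involved than this sketch:

```python
# Gerchberg-Saxton: alternate between enforcing known magnitudes in two planes.
import numpy as np

rng = np.random.default_rng(1)
n = 256
x = rng.normal(size=n) + 1j * rng.normal(size=n)  # unknown complex field
mag_t = np.abs(x)                                 # intensity in one plane
mag_f = np.abs(np.fft.fft(x))                     # intensity in the Fourier plane

est = mag_t * np.exp(1j * rng.uniform(0, 2 * np.pi, n))  # random initial phase
for _ in range(500):
    spec = np.fft.fft(est)
    spec = mag_f * np.exp(1j * np.angle(spec))    # impose Fourier magnitudes
    est = np.fft.ifft(spec)
    est = mag_t * np.exp(1j * np.angle(est))      # impose time-plane magnitudes

err = np.linalg.norm(np.abs(np.fft.fft(est)) - mag_f) / np.linalg.norm(mag_f)
print("relative magnitude error:", round(float(err), 4))  # should be small
```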
class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.05834v1-abstract-short" style="display: inline;"> Dance generation, as a branch of human motion generation, has attracted increasing attention. Recently, a few works attempt to enhance dance expressiveness, which includes genre matching, beat alignment, and dance dynamics, from certain aspects. However, the enhancement is quite limited as they lack comprehensive consideration of the aforementioned three factors. In this paper, we propose Expressi&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.05834v1-abstract-full').style.display = 'inline'; document.getElementById('2403.05834v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.05834v1-abstract-full" style="display: none;"> Dance generation, as a branch of human motion generation, has attracted increasing attention. Recently, a few works attempt to enhance dance expressiveness, which includes genre matching, beat alignment, and dance dynamics, from certain aspects. However, the enhancement is quite limited as they lack comprehensive consideration of the aforementioned three factors. In this paper, we propose ExpressiveBailando, a novel dance generation method designed to generate expressive dances, concurrently taking all three factors into account. Specifically, we mitigate the issue of speed homogenization by incorporating frequency information into VQ-VAE, thus improving dance dynamics. Additionally, we integrate music style information by extracting genre- and beat-related features with a pre-trained music model, hence achieving improvements in the other two factors. Extensive experimental results demonstrate that our proposed method can generate dances with high expressiveness and outperforms existing methods both qualitatively and quantitatively. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.05834v1-abstract-full').style.display = 'none'; document.getElementById('2403.05834v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. 
arXiv:2403.02566 [pdf, other] eess.IV cs.CV
Enhancing Weakly Supervised 3D Medical Image Segmentation through Probabilistic-aware Learning
Authors: Zhaoxin Fan, Runmin Jiang, Junhao Wu, Xin Huang, Tianyang Wang, Heng Huang, Min Xu
Abstract: 3D medical image segmentation is a challenging task with crucial implications for disease diagnosis and treatment planning. Recent advances in deep learning have significantly enhanced fully supervised medical image segmentation. However, this approach heavily relies on labor-intensive and time-consuming fully annotated ground-truth labels, particularly for 3D volumes. To overcome this limitation, we propose a novel probabilistic-aware weakly supervised learning pipeline, specifically designed for 3D medical imaging. Our pipeline integrates three innovative components: a probability-based pseudo-label generation technique for synthesizing dense segmentation masks from sparse annotations, a Probabilistic Multi-head Self-Attention network for robust feature extraction within our Probabilistic Transformer Network, and a Probability-informed Segmentation Loss Function to enhance training with annotation confidence. Demonstrating significant advances, our approach not only rivals the performance of fully supervised methods but also surpasses existing weakly supervised methods on CT and MRI datasets, achieving up to 18.1% improvement in Dice scores for certain organs. The code is available at https://github.com/runminjiang/PW4MedSeg.
Submitted 4 March, 2024; originally announced March 2024.
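The abstract names a probability-based pseudo-label generation step without spelling it out. One plausible reading, sketched below entirely under our own assumptions (Euclidean distance from sparse seed voxels turned into soft foreground probabilities; the decay length tau is hypothetical), is:

```python
import numpy as np
from scipy.ndimage import distance_transform_edt

def soft_pseudo_labels(scribble_fg, scribble_bg, tau=10.0):
    """Turn sparse 3D scribbles into a dense soft (probabilistic) mask.

    scribble_fg / scribble_bg: boolean volumes marking a few annotated
    foreground / background voxels. tau is an illustrative decay length
    in voxels; none of this is taken from the paper's actual method.
    """
    d_fg = distance_transform_edt(~scribble_fg)  # distance to nearest fg seed
    d_bg = distance_transform_edt(~scribble_bg)  # distance to nearest bg seed
    # Stable softmax over negative distances: nearer seeds -> higher probability.
    logit_fg, logit_bg = -d_fg / tau, -d_bg / tau
    m = np.maximum(logit_fg, logit_bg)
    p_fg = np.exp(logit_fg - m) / (np.exp(logit_fg - m) + np.exp(logit_bg - m))
    return p_fg  # per-voxel foreground probability, usable as a soft target
```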
arXiv:2402.15738 [pdf, other] cs.CR eess.SY
Privacy-Preserving State Estimation in the Presence of Eavesdroppers: A Survey
Authors: Xinhao Yan, Guanzhong Zhou, Daniel E. Quevedo, Carlos Murguia, Bo Chen, Hailong Huang
Abstract: Networked systems are increasingly the target of cyberattacks that exploit vulnerabilities within digital communications, embedded hardware, and software. Arguably, the simplest class of attacks -- and often the first type before launching destructive integrity attacks -- are eavesdropping attacks, which aim to infer information by collecting system data and exploiting it for malicious purposes. A key technology of networked systems is state estimation, which leverages sensing and actuation data and first-principles models to enable trajectory planning, real-time monitoring, and control. However, state estimation can also be exploited by eavesdroppers to identify models and reconstruct states with the aim of, e.g., launching integrity (stealthy) attacks and inferring sensitive information. It is therefore crucial to protect disclosed system data to avoid an accurate state estimation by eavesdroppers. This survey presents a comprehensive review of the existing literature on privacy-preserving state estimation methods, while also identifying potential limitations and research gaps. Our primary focus revolves around three types of methods: cryptography, data perturbation, and transmission scheduling, with particular emphasis on Kalman-like filters. Within these categories, we delve into the concepts of homomorphic encryption and differential privacy, which have been extensively investigated in recent years in the context of privacy-preserving state estimation. Finally, we shed light on several technical and fundamental challenges surrounding current methods and propose potential directions for future research.
Submitted 24 February, 2024; originally announced February 2024.
Comments: 16 pages, 5 figures, 4 tables
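Of the three method families the survey covers, data perturbation is the easiest to illustrate. Below is a minimal sketch of a Laplace-mechanism-style perturbation of disclosed sensor data; the sensitivity/epsilon calibration is our own illustrative choice, not a scheme from the survey:

```python
import numpy as np

def privatize_measurement(y, sensitivity, epsilon, rng=None):
    """Release a differentially-private version of a sensor measurement.

    Adds zero-mean Laplace noise with scale sensitivity/epsilon (the
    standard Laplace mechanism). An eavesdropper observing the released
    values gets a degraded basis for state reconstruction, at the cost
    of estimation accuracy for the legitimate receiver.
    """
    rng = rng or np.random.default_rng()
    scale = sensitivity / epsilon
    return y + rng.laplace(loc=0.0, scale=scale, size=np.shape(y))

# Example: release a noisy 2D position measurement with epsilon = 0.5.
y_private = privatize_measurement(np.array([1.2, -0.4]), sensitivity=0.1, epsilon=0.5)
```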
arXiv:2402.15693 [pdf] eess.SY cs.AR
Photolithography Control System: A Case Study for Cyber-Physical System
Authors: Youbao Zhang, Huijie Huang
Abstract: The photolithography control system (PCS) is an extremely complex distributed control system, composed of dozens of networked microprocessors, hundreds of actuators, hundreds of thousands of sensors, and millions of lines of code. The cyber-physical system (CPS), which deeply merges computation with physical processes, copes with complex systems from a higher level of abstraction. PCS is a representative CPS. This work points out that thinking within the CPS framework, which includes a holistic perspective, model-based design, hardware/software co-design, and continuous integration, could solve the issues present in the current PCS. Although the traditional embedded system approach and the CPS approach will coexist in the PCS for a long time, the CPS approach is definitely the future of PCS development.
Submitted 23 February, 2024; originally announced February 2024.
Comments: 22 pages, 10 figures, 4 tables
arXiv:2402.02411 [pdf, other] eess.IV cs.CV
Physics-Inspired Degradation Models for Hyperspectral Image Fusion
Authors: Jie Lian, Lizhi Wang, Lin Zhu, Renwei Dian, Zhiwei Xiong, Hua Huang
Abstract: The fusion of a low-spatial-resolution hyperspectral image (LR-HSI) with a high-spatial-resolution multispectral image (HR-MSI) has garnered increasing research interest. However, most fusion methods solely focus on the fusion algorithm itself and overlook the degradation models, which results in unsatisfactory performance in practical scenarios. To fill this gap, we propose physics-inspired degradation models (PIDM) to model the degradation of LR-HSI and HR-MSI, which comprise a spatial degradation network (SpaDN) and a spectral degradation network (SpeDN). SpaDN and SpeDN are designed based on two insights. First, we employ spatial warping and spectral modulation operations to simulate lens aberrations, thereby introducing non-uniformity into the spatial and spectral degradation processes. Second, we utilize asymmetric downsampling and parallel downsampling operations to separately reduce the spatial and spectral resolutions of the images, thus ensuring that the spatial and spectral degradation processes match specific physical characteristics. Once SpaDN and SpeDN are established, we adopt a self-supervised training strategy to optimize the network parameters and provide a plug-and-play solution for fusion methods. Comprehensive experiments demonstrate that our proposed PIDM can boost the fusion performance of existing fusion methods in practical scenarios.
Submitted 4 February, 2024; originally announced February 2024.
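For orientation, the classical linear degradation model that such physics-inspired networks generalize writes the LR-HSI as a spatially blurred and downsampled hyperspectral cube and the HR-MSI as a spectrally mixed one. A minimal sketch follows; the uniform blur kernel and random spectral response function are our own placeholder choices, not the learned SpaDN/SpeDN:

```python
import numpy as np
from scipy.ndimage import uniform_filter

def degrade(hsi, spatial_factor=4, srf=None, rng=None):
    """hsi: (H, W, B) ground-truth hyperspectral cube.
    Returns (lr_hsi, hr_msi) under the standard linear degradation model."""
    H, W, B = hsi.shape
    rng = rng or np.random.default_rng(0)
    # Spatial degradation: blur each band, then subsample -> LR-HSI.
    blurred = uniform_filter(hsi, size=(spatial_factor, spatial_factor, 1))
    lr_hsi = blurred[::spatial_factor, ::spatial_factor, :]
    # Spectral degradation: mix bands with a spectral response function -> HR-MSI.
    if srf is None:                     # placeholder SRF with 3 broad bands
        srf = rng.random((B, 3))
        srf /= srf.sum(axis=0, keepdims=True)
    hr_msi = (hsi.reshape(-1, B) @ srf).reshape(H, W, -1)
    return lr_hsi, hr_msi
```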
arXiv:2402.02349 [pdf] eess.IV cs.CV
3D Lymphoma Segmentation on PET/CT Images via Multi-Scale Information Fusion with Cross-Attention
Authors: Huan Huang, Liheng Qiu, Shenmiao Yang, Longxi Li, Jiaofen Nan, Yanting Li, Chuang Han, Fubao Zhu, Chen Zhao, Weihua Zhou
Abstract: Background: Accurate segmentation of diffuse large B-cell lymphoma (DLBCL) lesions is challenging due to their complex patterns in medical imaging. Objective: This study aims to develop a precise segmentation method for DLBCL using 18F-Fluorodeoxyglucose (FDG) positron emission tomography (PET) and computed tomography (CT) images. Methods: We propose a 3D dual-branch encoder segmentation method using shifted window transformers and a Multi-Scale Information Fusion (MSIF) module. To enhance feature integration, the MSIF module performs multi-scale feature fusion using cross-attention mechanisms within a shifted window framework. A gated neural network within the MSIF module dynamically balances the contributions from each modality. The model was optimized using the Dice Similarity Coefficient (DSC) loss function. Additionally, we computed the total metabolic tumor volume (TMTV) and performed statistical analyses. Results: The model was trained and validated on a dataset of 165 DLBCL patients using 5-fold cross-validation, achieving a DSC of 0.7512. Statistical analysis showed a significant improvement over comparative methods (p < 0.05). Additionally, a Pearson correlation coefficient of 0.91 and an R^2 of 0.89 were observed when comparing manual annotations to segmentation results for TMTV measurement. Conclusion: This study presents an effective automatic segmentation method for DLBCL that leverages the complementary strengths of PET and CT imaging. Our method has the potential to improve diagnostic interpretations and assist in treatment planning for DLBCL patients.
Submitted 9 September, 2024; v1 submitted 4 February, 2024; originally announced February 2024.
Comments: 19 pages, 7 figures; reference added
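The described fusion pattern, cross-attention between the two modalities plus a learned gate balancing their contributions, can be sketched compactly. Dimensions and the gating form below are our own assumptions, not the paper's exact MSIF architecture:

```python
import torch
import torch.nn as nn

class CrossModalFusion(nn.Module):
    """Toy PET/CT feature fusion: each modality attends to the other,
    then a learned gate balances the two streams per token."""
    def __init__(self, dim=96, heads=4):
        super().__init__()
        self.attn_pet = nn.MultiheadAttention(dim, heads, batch_first=True)
        self.attn_ct = nn.MultiheadAttention(dim, heads, batch_first=True)
        self.gate = nn.Sequential(nn.Linear(2 * dim, dim), nn.Sigmoid())

    def forward(self, pet_tokens, ct_tokens):
        # pet_tokens, ct_tokens: (batch, n_tokens, dim) flattened 3D patches
        pet_enh, _ = self.attn_pet(pet_tokens, ct_tokens, ct_tokens)  # PET queries CT
        ct_enh, _ = self.attn_ct(ct_tokens, pet_tokens, pet_tokens)   # CT queries PET
        g = self.gate(torch.cat([pet_enh, ct_enh], dim=-1))           # per-token gate
        return g * pet_enh + (1 - g) * ct_enh

fused = CrossModalFusion()(torch.randn(2, 64, 96), torch.randn(2, 64, 96))
```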
arXiv:2401.16087 [pdf, other] cs.CV eess.IV
High Resolution Image Quality Database
Authors: Huang Huang, Qiang Wan, Jari Korhonen
Abstract: With technology for digital photography and high resolution displays rapidly evolving and gaining popularity, there is a growing demand for blind image quality assessment (BIQA) models for high resolution images. Unfortunately, the publicly available large scale image quality databases used for training BIQA models contain mostly low or general resolution images. Since image resizing affects image quality, we assume that the accuracy of BIQA models trained on low resolution images would not be optimal for high resolution images. Therefore, we created a new high resolution image quality database (HRIQ), consisting of 1120 images with a resolution of 2880x2160 pixels. We conducted a subjective study to collect the subjective quality ratings for HRIQ in a controlled laboratory setting, resulting in accurate MOS at high resolution. To demonstrate the importance of a high resolution image quality database for training BIQA models to predict mean opinion scores (MOS) of high resolution images accurately, we trained and tested several traditional and deep learning based BIQA methods on different resolution versions of our database. The database is publicly available at https://github.com/jarikorhonen/hriq.
Submitted 29 January, 2024; originally announced January 2024.
arXiv:2401.09036 [pdf, other] cs.IT eess.SP
IRS-Enhanced Anti-Jamming Precoding Against DISCO Physical Layer Jamming Attacks
Authors: Huan Huang, Hongliang Zhang, Yi Cai, Yunjing Zhang, A. Lee Swindlehurst, Zhu Han
Abstract: Illegitimate intelligent reflective surfaces (IRSs) can pose significant physical layer security risks to multi-user multiple-input single-output (MU-MISO) systems. Recently, a DISCO approach has been proposed that uses an illegitimate IRS with random and time-varying reflection coefficients, referred to as a "disco" IRS (DIRS). Such a DIRS can attack MU-MISO systems without relying on either jamming power or channel state information (CSI), and classical anti-jamming techniques are ineffective against DIRS-based fully-passive jammers (DIRS-based FPJs). In this paper, we propose an IRS-enhanced anti-jamming precoder against DIRS-based FPJs that requires only statistical rather than instantaneous CSI of the DIRS-jammed channels. Specifically, a legitimate IRS is introduced to reduce the strength of the DIRS-based jamming relative to the transmit signals at a legitimate user (LU). In addition, the active beamforming at the legitimate access point (AP) is designed to maximize the signal-to-jamming-plus-noise ratios (SJNRs). Numerical results are presented to evaluate the effectiveness of the proposed IRS-enhanced anti-jamming precoder against DIRS-based FPJs.
Submitted 17 January, 2024; originally announced January 2024.
Comments: This paper has been accepted by IEEE ICC 2024
arXiv:2401.07398 [pdf, other] cs.CV cs.LG eess.IV; doi: 10.1109/ACCESS.2024.3436620
Cross Domain Early Crop Mapping using CropSTGAN
Authors: Yiqun Wang, Hui Huang, Radu State
Abstract: Driven by abundant satellite imagery, machine learning-based approaches have recently been promoted to generate high-resolution crop cultivation maps to support many agricultural applications. One of the major challenges faced by these approaches is the limited availability of ground truth labels. In the absence of ground truth, existing work usually adopts the "direct transfer strategy" that trains a classifier using historical labels collected from other regions and then applies the trained model to the target region. Unfortunately, the spectral features of crops exhibit inter-region and inter-annual variability due to changes in soil composition, climate conditions, and crop progress, so the resultant models perform poorly on new and unseen regions or years. Despite recent efforts, such as the application of the deep adaptation neural network (DANN) model structure in the deep adaptation crop classification network (DACCN), to tackle the above cross-domain challenges, their effectiveness diminishes significantly when there is a large dissimilarity between the source and target regions. This paper introduces the Crop Mapping Spectral-temporal Generative Adversarial Neural Network (CropSTGAN), a novel solution to these cross-domain challenges that does not require target domain labels. CropSTGAN learns to transform the target domain's spectral features to those of the source domain, effectively bridging large dissimilarities. Additionally, it employs an identity loss to maintain the intrinsic local structure of the data. Comprehensive experiments across various regions and years demonstrate the benefits and effectiveness of the proposed approach. In the experiments, CropSTGAN is benchmarked against various state-of-the-art (SOTA) methods. Notably, CropSTGAN significantly outperforms these methods in scenarios with large data distribution dissimilarities between the target and source domains.
Submitted 18 April, 2024; v1 submitted 14 January, 2024; originally announced January 2024.
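The identity loss mentioned here has a standard CycleGAN-style reading: when the generator is fed a sample already in the source domain, it should return it unchanged. A minimal sketch under that assumption; the generator below is a placeholder, not the paper's network:

```python
import torch
import torch.nn as nn

# Placeholder generator: maps a (batch, T, B) spectral-temporal profile
# from the target domain toward the source domain's distribution.
G = nn.Sequential(nn.Linear(10, 64), nn.ReLU(), nn.Linear(64, 10))

def identity_loss(G, x_source):
    """Encourage G to act as the identity on source-domain samples,
    which helps preserve the intrinsic local structure of the data."""
    return nn.functional.l1_loss(G(x_source), x_source)

x_src = torch.randn(32, 24, 10)   # 32 pixels, 24 time steps, 10 bands
loss_id = identity_loss(G, x_src)
```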
arXiv:2312.15921 [pdf, other] cs.IT eess.SP
Hybrid Precoder Design for Angle-of-Departure Estimation with Limited-Resolution Phase Shifters
Authors: Huiping Huang, Musa Furkan Keskin, Henk Wymeersch, Xuesong Cai, Linlong Wu, Johan Thunberg, Fredrik Tufvesson
Abstract: Hybrid analog-digital beamforming stands out as a key enabler for future communication systems with a massive number of antennas. In this paper, we investigate the hybrid precoder design problem for angle-of-departure (AoD) estimation, where we take into account the practical constraint of the limited resolution of phase shifters. Our goal is to design a radio-frequency (RF) precoder and a base-band (BB) precoder to estimate the AoD of the user with high accuracy. To this end, we propose a two-step strategy where we first obtain the fully digital precoder that minimizes the angle error bound, and then decompose the resulting digital precoder into an RF precoder and a BB precoder, based on alternating optimization and the alternating direction method of multipliers. Besides, we derive the quantization error upper bound and analyse the convergence behavior of the proposed algorithm. Numerical results demonstrate the superior performance of the proposed method over state-of-the-art baselines.
Submitted 22 October, 2024; v1 submitted 26 December, 2023; originally announced December 2023.
Comments: This paper has been accepted for publication in IEEE Transactions on Communications
arXiv:2312.15380 [pdf, other] cs.NI eess.SP
Battery-Care Resource Allocation and Task Offloading in Multi-Agent Post-Disaster MEC Environment
Authors: Yiwei Tang, Hualong Huang, Wenhan Zhan, Geyong Min, Zhekai Duan, Yuchuan Lei
Abstract: As an up-and-coming application scenario of mobile edge computing (MEC), post-disaster rescue involves multitudinous computing-intensive tasks but cannot count on stably guaranteed network connectivity. In rescue environments, quality of service (QoS), such as task execution delay, energy consumption and battery state of health (SoH), is of significant importance. This paper studies a multi-user post-disaster MEC environment with unstable 5G communication, where device-to-device (D2D) link communication and dynamic voltage and frequency scaling (DVFS) are adopted to balance each user's requirements for task delay and energy consumption. A battery degradation evaluation approach to prolong battery lifetime is also presented. The distributed optimization problem is formulated into a mixed cooperative-competitive (MCC) multi-agent Markov decision process (MAMDP) and is tackled with recurrent multi-agent Proximal Policy Optimization (rMAPPO). Extensive simulations and comprehensive comparisons with other representative algorithms clearly demonstrate the effectiveness of the proposed rMAPPO-based offloading scheme.
Submitted 23 December, 2023; originally announced December 2023.
Comments: Accepted by WCNC 2024
arXiv:2312.14776 [pdf, other] cs.CV eess.IV
Compressing Image-to-Image Translation GANs Using Local Density Structures on Their Learned Manifold
Authors: Alireza Ganjdanesh, Shangqian Gao, Hirad Alipanah, Heng Huang
Abstract: Generative Adversarial Networks (GANs) have shown remarkable success in modeling complex data distributions for image-to-image translation. Still, their high computational demands prohibit their deployment in practical scenarios like edge devices. Existing GAN compression methods mainly rely on knowledge distillation or convolutional classifiers' pruning techniques. Thus, they neglect the critical characteristic of GANs: their local density structure over their learned manifold. Accordingly, we approach GAN compression from a new perspective by explicitly encouraging the pruned model to preserve the density structure of the original parameter-heavy model on its learned manifold. We facilitate this objective for the pruned model by partitioning the learned manifold of the original generator into local neighborhoods around its generated samples. Then, we propose a novel pruning objective to regularize the pruned model to preserve the local density structure over each neighborhood, resembling the kernel density estimation method. Also, we develop a collaborative pruning scheme in which the discriminator and generator are pruned by two pruning agents. We design the agents to capture interactions between the generator and discriminator by exchanging their peer's feedback when determining the corresponding models' architectures. Thanks to such a design, our pruning method can efficiently find performant sub-networks and can maintain the balance between the generator and discriminator more effectively compared to baselines during pruning, thereby showing more stable pruning dynamics. Our experiments on image translation GAN models, Pix2Pix and CycleGAN, with various benchmark datasets and architectures demonstrate our method's effectiveness.
Submitted 22 December, 2023; originally announced December 2023.
Comments: The 38th Annual AAAI Conference on Artificial Intelligence, AAAI 2024
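The density-preservation idea can be made concrete with a kernel density comparison: use the original generator's samples as neighborhood anchors and penalize the pruned generator when its density at those anchors drifts. A rough sketch under our own choice of Gaussian kernel and feature space, not the paper's exact objective:

```python
import torch

def kde(points, centers, bandwidth=1.0):
    """Gaussian kernel density of `points` evaluated at `centers`;
    both are (n, d) feature embeddings of generated images."""
    d2 = torch.cdist(centers, points) ** 2
    return torch.exp(-d2 / (2 * bandwidth ** 2)).mean(dim=1)

def density_preservation_loss(feats_orig, feats_pruned, bandwidth=1.0):
    """Match the pruned generator's local density to the original
    generator's, neighborhood by neighborhood (the original samples
    act as the neighborhood anchors)."""
    p_orig = kde(feats_orig, feats_orig, bandwidth)
    p_pruned = kde(feats_pruned, feats_orig, bandwidth)
    return torch.mean((p_orig - p_pruned) ** 2)

loss = density_preservation_loss(torch.randn(128, 64), torch.randn(128, 64))
```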
href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 
0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>

Pages: 1 2 3 4 5 6 7 8 9 10