CINXE.COM
Search | arXiv e-print repository
<!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"/> <meta name="viewport" content="width=device-width, initial-scale=1"/> <!-- new favicon config and versions by realfavicongenerator.net --> <link rel="apple-touch-icon" sizes="180x180" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-16x16.png"> <link rel="manifest" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/site.webmanifest"> <link rel="mask-icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/safari-pinned-tab.svg" color="#b31b1b"> <link rel="shortcut icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon.ico"> <meta name="msapplication-TileColor" content="#b31b1b"> <meta name="msapplication-config" content="images/icons/browserconfig.xml"> <meta name="theme-color" content="#b31b1b"> <!-- end favicon config --> <title>Search | arXiv e-print repository</title> <script defer src="https://static.arxiv.org/static/base/1.0.0a5/fontawesome-free-5.11.2-web/js/all.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/base/1.0.0a5/css/arxivstyle.css" /> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ messageStyle: "none", extensions: ["tex2jax.js"], jax: ["input/TeX", "output/HTML-CSS"], tex2jax: { inlineMath: [ ['$','$'], ["\\(","\\)"] ], displayMath: [ ['$$','$$'], ["\\[","\\]"] ], processEscapes: true, ignoreClass: '.*', processClass: 'mathjax.*' }, TeX: { extensions: ["AMSmath.js", "AMSsymbols.js", "noErrors.js"], noErrors: { inlineDelimiters: ["$","$"], multiLine: false, style: { "font-size": "normal", "border": "" } } }, "HTML-CSS": { availableFonts: ["TeX"] } }); </script> <script src='//static.arxiv.org/MathJax-2.7.3/MathJax.js'></script> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/notification.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/bulma-tooltip.min.css" /> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/search.css" /> <script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g=" crossorigin="anonymous"></script> <script src="https://static.arxiv.org/static/search/0.5.6/js/fieldset.js"></script> <style> radio#cf-customfield_11400 { display: none; } </style> </head> <body> <header><a href="#main-container" class="is-sr-only">Skip to main content</a> <!-- contains Cornell logo and sponsor statement --> <div class="attribution level is-marginless" role="banner"> <div class="level-left"> <a class="level-item" href="https://cornell.edu/"><img src="https://static.arxiv.org/static/base/1.0.0a5/images/cornell-reduced-white-SMALL.svg" alt="Cornell University" width="200" aria-label="logo" /></a> </div> <div class="level-right is-marginless"><p class="sponsors level-item is-marginless"><span id="support-ack-url">We gratefully acknowledge support from<br /> the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors. <a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 337 results for author: <span class="mathjax">Zhang, Q</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> <div class="content"> <form method="GET" action="/search/eess" aria-role="search"> Searching in archive <strong>eess</strong>. <a href="/search/?searchtype=author&query=Zhang%2C+Q">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." type="text" value="Zhang, Q"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Zhang%2C+Q&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Zhang, Q"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. </div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Zhang%2C+Q&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Zhang%2C+Q&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhang%2C+Q&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhang%2C+Q&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhang%2C+Q&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhang%2C+Q&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhang%2C+Q&start=250" class="pagination-link " aria-label="Page 6" aria-current="page">6 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhang%2C+Q&start=300" class="pagination-link " aria-label="Page 7" aria-current="page">7 </a> </li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.00715">arXiv:2412.00715</a> <span> [<a href="https://arxiv.org/pdf/2412.00715">pdf</a>, <a href="https://arxiv.org/format/2412.00715">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> A Semi-Supervised Approach with Error Reflection for Echocardiography Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Han%2C+X">Xiaoxiang Han</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+Y">Yiman Liu</a>, <a href="/search/eess?searchtype=author&query=Shang%2C+J">Jiang Shang</a>, <a href="/search/eess?searchtype=author&query=Li%2C+Q">Qingli Li</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+J">Jiangang Chen</a>, <a href="/search/eess?searchtype=author&query=Hu%2C+M">Menghan Hu</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+Q">Qi Zhang</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+Y">Yuqi Zhang</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+Y">Yan Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.00715v1-abstract-short" style="display: inline;"> Segmenting internal structure from echocardiography is essential for the diagnosis and treatment of various heart diseases. Semi-supervised learning shows its ability in alleviating annotations scarcity. While existing semi-supervised methods have been successful in image segmentation across various medical imaging modalities, few have attempted to design methods specifically addressing the challe… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.00715v1-abstract-full').style.display = 'inline'; document.getElementById('2412.00715v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.00715v1-abstract-full" style="display: none;"> Segmenting internal structure from echocardiography is essential for the diagnosis and treatment of various heart diseases. Semi-supervised learning shows its ability in alleviating annotations scarcity. While existing semi-supervised methods have been successful in image segmentation across various medical imaging modalities, few have attempted to design methods specifically addressing the challenges posed by the poor contrast, blurred edge details and noise of echocardiography. These characteristics pose challenges to the generation of high-quality pseudo-labels in semi-supervised segmentation based on Mean Teacher. Inspired by human reflection on erroneous practices, we devise an error reflection strategy for echocardiography semi-supervised segmentation architecture. The process triggers the model to reflect on inaccuracies in unlabeled image segmentation, thereby enhancing the robustness of pseudo-label generation. Specifically, the strategy is divided into two steps. The first step is called reconstruction reflection. The network is tasked with reconstructing authentic proxy images from the semantic masks of unlabeled images and their auxiliary sketches, while maximizing the structural similarity between the original inputs and the proxies. The second step is called guidance correction. Reconstruction error maps decouple unreliable segmentation regions. Then, reliable data that are more likely to occur near high-density areas are leveraged to guide the optimization of unreliable data potentially located around decision boundaries. Additionally, we introduce an effective data augmentation strategy, termed as multi-scale mixing up strategy, to minimize the empirical distribution gap between labeled and unlabeled images and perceive diverse scales of cardiac anatomical structures. Extensive experiments demonstrate the competitiveness of the proposed method. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.00715v1-abstract-full').style.display = 'none'; document.getElementById('2412.00715v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">6 pages, 4 figure, accepted by 2024 IEEE International Conference on Bioinformatics and Biomedicine (BIBM 2024)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12228">arXiv:2411.12228</a> <span> [<a href="https://arxiv.org/pdf/2411.12228">pdf</a>, <a href="https://arxiv.org/format/2411.12228">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Robust Deep Joint Source-Channel Coding Enabled Distributed Image Transmission with Imperfect Channel State Information </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Dong%2C+B">Biao Dong</a>, <a href="/search/eess?searchtype=author&query=Cao%2C+B">Bin Cao</a>, <a href="/search/eess?searchtype=author&query=Gui%2C+G">Guan Gui</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+Q">Qinyu Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12228v1-abstract-short" style="display: inline;"> This work is concerned with robust distributed multi-view image transmission over a severe fading channel with imperfect channel state information (CSI), wherein the sources are slightly correlated. Since the signals are further distorted at the decoder, traditional distributed deep joint source-channel coding (DJSCC) suffers considerable performance degradation. To tackle this problem, we leverag… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12228v1-abstract-full').style.display = 'inline'; document.getElementById('2411.12228v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.12228v1-abstract-full" style="display: none;"> This work is concerned with robust distributed multi-view image transmission over a severe fading channel with imperfect channel state information (CSI), wherein the sources are slightly correlated. Since the signals are further distorted at the decoder, traditional distributed deep joint source-channel coding (DJSCC) suffers considerable performance degradation. To tackle this problem, we leverage the complementarity and consistency characteristics among the distributed, yet correlated sources, and propose an enhanced robust DJSCC, namely RDJSCC. In RDJSCC, we design a novel cross-view information extraction (CVIE) mechanism to capture more nuanced cross-view patterns and dependencies. In addition, a complementarity-consistency fusion (CCF) mechanism is utilized to fuse the complementarity and consistency from multi-view information in a symmetric and compact manner. Theoretical analysis and simulation results show that our proposed RDJSCC can effectively leverage the advantages of correlated sources even under severe fading conditions, leading to an improved reconstruction performance. The open source code of this work is available at:https://dongbiao26.github.io/rdjscc/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12228v1-abstract-full').style.display = 'none'; document.getElementById('2411.12228v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.08376">arXiv:2411.08376</a> <span> [<a href="https://arxiv.org/pdf/2411.08376">pdf</a>, <a href="https://arxiv.org/format/2411.08376">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Transfer Learning Guided Noise Reduction for Automatic Modulation Classification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Ji%2C+Z">Zelin Ji</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+S">Shuo Wang</a>, <a href="/search/eess?searchtype=author&query=Yang%2C+K">Kuojun Yang</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+Q">Qinchuan Zhang</a>, <a href="/search/eess?searchtype=author&query=Ye%2C+P">Peng Ye</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.08376v1-abstract-short" style="display: inline;"> Automatic modulation classification (AMC) has emerged as a key technique in cognitive radio networks in sixth-generation (6G) communications. AMC enables effective data transmission without requiring prior knowledge of modulation schemes. However, the low classification accuracy under the condition of low signal-to-noise ratio (SNR) limits the implementation of AMC techniques under the rapidly cha… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.08376v1-abstract-full').style.display = 'inline'; document.getElementById('2411.08376v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.08376v1-abstract-full" style="display: none;"> Automatic modulation classification (AMC) has emerged as a key technique in cognitive radio networks in sixth-generation (6G) communications. AMC enables effective data transmission without requiring prior knowledge of modulation schemes. However, the low classification accuracy under the condition of low signal-to-noise ratio (SNR) limits the implementation of AMC techniques under the rapidly changing physical channels in 6G and beyond. This paper investigates the AMC technique for the signals with dynamic and varying SNRs, and a deep learning based noise reduction network is proposed to reduce the noise introduced by the wireless channel and the receiving equipment. In particular, a transfer learning guided learning framework (TNR-AMC) is proposed to utilize the scarce annotated modulation signals and improve the classification accuracy for low SNR modulation signals. The numerical results show that the proposed noise reduction network achieves an accuracy improvement of over 20\% in low SNR scenarios, and the TNR-AMC framework can improve the classification accuracy under unstable SNRs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.08376v1-abstract-full').style.display = 'none'; document.getElementById('2411.08376v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Submitted to ICC 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.07751">arXiv:2411.07751</a> <span> [<a href="https://arxiv.org/pdf/2411.07751">pdf</a>, <a href="https://arxiv.org/format/2411.07751">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> SAV-SE: Scene-aware Audio-Visual Speech Enhancement with Selective State Space Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Qian%2C+X">Xinyuan Qian</a>, <a href="/search/eess?searchtype=author&query=Gao%2C+J">Jiaran Gao</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+Y">Yaodan Zhang</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+Q">Qiquan Zhang</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+H">Hexin Liu</a>, <a href="/search/eess?searchtype=author&query=Garcia%2C+L+P">Leibny Paola Garcia</a>, <a href="/search/eess?searchtype=author&query=Li%2C+H">Haizhou Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.07751v1-abstract-short" style="display: inline;"> Speech enhancement plays an essential role in various applications, and the integration of visual information has been demonstrated to bring substantial advantages. However, the majority of current research concentrates on the examination of facial and lip movements, which can be compromised or entirely inaccessible in scenarios where occlusions occur or when the camera view is distant. Whereas co… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07751v1-abstract-full').style.display = 'inline'; document.getElementById('2411.07751v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.07751v1-abstract-full" style="display: none;"> Speech enhancement plays an essential role in various applications, and the integration of visual information has been demonstrated to bring substantial advantages. However, the majority of current research concentrates on the examination of facial and lip movements, which can be compromised or entirely inaccessible in scenarios where occlusions occur or when the camera view is distant. Whereas contextual visual cues from the surrounding environment have been overlooked: for example, when we see a dog bark, our brain has the innate ability to discern and filter out the barking noise. To this end, in this paper, we introduce a novel task, i.e. SAV-SE. To our best knowledge, this is the first proposal to use rich contextual information from synchronized video as auxiliary cues to indicate the type of noise, which eventually improves the speech enhancement performance. Specifically, we propose the VC-S$^2$E method, which incorporates the Conformer and Mamba modules for their complementary strengths. Extensive experiments are conducted on public MUSIC, AVSpeech and AudioSet datasets, where the results demonstrate the superiority of VC-S$^2$E over other competitive methods. We will make the source code publicly available. Project demo page: https://AVSEPage.github.io/ <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07751v1-abstract-full').style.display = 'none'; document.getElementById('2411.07751v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.07483">arXiv:2411.07483</a> <span> [<a href="https://arxiv.org/pdf/2411.07483">pdf</a>, <a href="https://arxiv.org/format/2411.07483">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Quantifying Knowledge Distillation Using Partial Information Decomposition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Dissanayake%2C+P">Pasan Dissanayake</a>, <a href="/search/eess?searchtype=author&query=Hamman%2C+F">Faisal Hamman</a>, <a href="/search/eess?searchtype=author&query=Halder%2C+B">Barproda Halder</a>, <a href="/search/eess?searchtype=author&query=Sucholutsky%2C+I">Ilia Sucholutsky</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+Q">Qiuyi Zhang</a>, <a href="/search/eess?searchtype=author&query=Dutta%2C+S">Sanghamitra Dutta</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.07483v1-abstract-short" style="display: inline;"> Knowledge distillation provides an effective method for deploying complex machine learning models in resource-constrained environments. It typically involves training a smaller student model to emulate either the probabilistic outputs or the internal feature representations of a larger teacher model. By doing so, the student model often achieves substantially better performance on a downstream tas… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07483v1-abstract-full').style.display = 'inline'; document.getElementById('2411.07483v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.07483v1-abstract-full" style="display: none;"> Knowledge distillation provides an effective method for deploying complex machine learning models in resource-constrained environments. It typically involves training a smaller student model to emulate either the probabilistic outputs or the internal feature representations of a larger teacher model. By doing so, the student model often achieves substantially better performance on a downstream task compared to when it is trained independently. Nevertheless, the teacher's internal representations can also encode noise or additional information that may not be relevant to the downstream task. This observation motivates our primary question: What are the information-theoretic limits of knowledge transfer? To this end, we leverage a body of work in information theory called Partial Information Decomposition (PID) to quantify the distillable and distilled knowledge of a teacher's representation corresponding to a given student and a downstream task. Moreover, we demonstrate that this metric can be practically used in distillation to address challenges caused by the complexity gap between the teacher and the student representations. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07483v1-abstract-full').style.display = 'none'; document.getElementById('2411.07483v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at NeurIPS 2024 Machine Learning and Compression Workshop</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.07001">arXiv:2411.07001</a> <span> [<a href="https://arxiv.org/pdf/2411.07001">pdf</a>, <a href="https://arxiv.org/format/2411.07001">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> DoF Analysis and Beamforming Design for Active IRS-aided Multi-user MIMO Wireless Communication in Rank-deficient Channels </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Shu%2C+F">Feng Shu</a>, <a href="/search/eess?searchtype=author&query=Jiang%2C+J">Jinbing Jiang</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+X">Xuehui Wang</a>, <a href="/search/eess?searchtype=author&query=Yang%2C+K">Ke Yang</a>, <a href="/search/eess?searchtype=author&query=Shen%2C+C">Chong Shen</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+Q">Qi Zhang</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+D">Dongming Wang</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+J">Jiangzhou Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.07001v2-abstract-short" style="display: inline;"> Due to its ability of significantly improving data rate, intelligent reflecting surface (IRS) will be a potential crucial technique for the future generation wireless networks like 6G. In this paper, we will focus on the analysis of degree of freedom (DoF) in IRS-aided multi-user MIMO network. Firstly, the DoF upper bound of IRS-aided single-user MIMO network, i.e., the achievable maximum DoF of s… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07001v2-abstract-full').style.display = 'inline'; document.getElementById('2411.07001v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.07001v2-abstract-full" style="display: none;"> Due to its ability of significantly improving data rate, intelligent reflecting surface (IRS) will be a potential crucial technique for the future generation wireless networks like 6G. In this paper, we will focus on the analysis of degree of freedom (DoF) in IRS-aided multi-user MIMO network. Firstly, the DoF upper bound of IRS-aided single-user MIMO network, i.e., the achievable maximum DoF of such a system, is derived, and the corresponding results are extended to the case of IRS-aided multiuser MIMO by using the matrix rank inequalities. In particular, in serious rank-deficient, also called low-rank, channels like line-of-sight (LoS), the network DoF may doubles over no-IRS with the help of IRS. To verify the rate performance gain from augmented DoF, three closed-form beamforming methods, null-space projection plus maximize transmit power and maximize receive power (NSP-MTP-MRP), Schmidt orthogonalization plus minimum mean square error (SO-MMSE) and two-layer leakage plus MMSE (TLL-MMSE) are proposed to achieve the maximum DoF. Simulation results shows that IRS does make a dramatic rate enhancement. For example, in a serious deficient channel, the sum-rate of the proposed TLL-MMSE aided by IRS is about twice that of no IRS. This means that IRS may achieve a significant DoF improvement in such a channel. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07001v2-abstract-full').style.display = 'none'; document.getElementById('2411.07001v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 11 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">12 pages, 9 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.06217">arXiv:2411.06217</a> <span> [<a href="https://arxiv.org/pdf/2411.06217">pdf</a>, <a href="https://arxiv.org/format/2411.06217">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Selective State Space Model for Monaural Speech Enhancement </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Chen%2C+M">Moran Chen</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+Q">Qiquan Zhang</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+M">Mingjiang Wang</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+X">Xiangyu Zhang</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+H">Hexin Liu</a>, <a href="/search/eess?searchtype=author&query=Ambikairaiah%2C+E">Eliathamby Ambikairaiah</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+D">Deying Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.06217v1-abstract-short" style="display: inline;"> Voice user interfaces (VUIs) have facilitated the efficient interactions between humans and machines through spoken commands. Since real-word acoustic scenes are complex, speech enhancement plays a critical role for robust VUI. Transformer and its variants, such as Conformer, have demonstrated cutting-edge results in speech enhancement. However, both of them suffers from the quadratic computationa… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06217v1-abstract-full').style.display = 'inline'; document.getElementById('2411.06217v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.06217v1-abstract-full" style="display: none;"> Voice user interfaces (VUIs) have facilitated the efficient interactions between humans and machines through spoken commands. Since real-word acoustic scenes are complex, speech enhancement plays a critical role for robust VUI. Transformer and its variants, such as Conformer, have demonstrated cutting-edge results in speech enhancement. However, both of them suffers from the quadratic computational complexity with respect to the sequence length, which hampers their ability to handle long sequences. Recently a novel State Space Model called Mamba, which shows strong capability to handle long sequences with linear complexity, offers a solution to address this challenge. In this paper, we propose a novel hybrid convolution-Mamba backbone, denoted as MambaDC, for speech enhancement. Our MambaDC marries the benefits of convolutional networks to model the local interactions and Mamba's ability for modeling long-range global dependencies. We conduct comprehensive experiments within both basic and state-of-the-art (SoTA) speech enhancement frameworks, on two commonly used training targets. The results demonstrate that MambaDC outperforms Transformer, Conformer, and the standard Mamba across all training targets. Built upon the current advanced framework, the use of MambaDC backbone showcases superior results compared to existing \textcolor{black}{SoTA} systems. This sets the stage for efficient long-range global modeling in speech enhancement. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06217v1-abstract-full').style.display = 'none'; document.getElementById('2411.06217v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Submitted to IEEE TCE</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.04568">arXiv:2411.04568</a> <span> [<a href="https://arxiv.org/pdf/2411.04568">pdf</a>, <a href="https://arxiv.org/format/2411.04568">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Neurons and Cognition">q-bio.NC</span> </div> </div> <p class="title is-5 mathjax"> Dynamic-Attention-based EEG State Transition Modeling for Emotion Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Shen%2C+X">Xinke Shen</a>, <a href="/search/eess?searchtype=author&query=Gan%2C+R">Runmin Gan</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+K">Kaixuan Wang</a>, <a href="/search/eess?searchtype=author&query=Yang%2C+S">Shuyi Yang</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+Q">Qingzhu Zhang</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+Q">Quanying Liu</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+D">Dan Zhang</a>, <a href="/search/eess?searchtype=author&query=Song%2C+S">Sen Song</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.04568v1-abstract-short" style="display: inline;"> Electroencephalogram (EEG)-based emotion decoding can objectively quantify people's emotional state and has broad application prospects in human-computer interaction and early detection of emotional disorders. Recently emerging deep learning architectures have significantly improved the performance of EEG emotion decoding. However, existing methods still fall short of fully capturing the complex s… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04568v1-abstract-full').style.display = 'inline'; document.getElementById('2411.04568v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.04568v1-abstract-full" style="display: none;"> Electroencephalogram (EEG)-based emotion decoding can objectively quantify people's emotional state and has broad application prospects in human-computer interaction and early detection of emotional disorders. Recently emerging deep learning architectures have significantly improved the performance of EEG emotion decoding. However, existing methods still fall short of fully capturing the complex spatiotemporal dynamics of neural signals, which are crucial for representing emotion processing. This study proposes a Dynamic-Attention-based EEG State Transition (DAEST) modeling method to characterize EEG spatiotemporal dynamics. The model extracts spatiotemporal components of EEG that represent multiple parallel neural processes and estimates dynamic attention weights on these components to capture transitions in brain states. The model is optimized within a contrastive learning framework for cross-subject emotion recognition. The proposed method achieved state-of-the-art performance on three publicly available datasets: FACED, SEED, and SEED-V. It achieved 75.4% accuracy in the binary classification of positive and negative emotions and 59.3% in nine-class discrete emotion classification on the FACED dataset, 88.1% in the three-class classification of positive, negative, and neutral emotions on the SEED dataset, and 73.6% in five-class discrete emotion classification on the SEED-V dataset. The learned EEG spatiotemporal patterns and dynamic transition properties offer valuable insights into neural dynamics underlying emotion processing. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04568v1-abstract-full').style.display = 'none'; document.getElementById('2411.04568v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">14 pages, 6 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.01623">arXiv:2411.01623</a> <span> [<a href="https://arxiv.org/pdf/2411.01623">pdf</a>, <a href="https://arxiv.org/format/2411.01623">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> FilterNet: Harnessing Frequency Filters for Time Series Forecasting </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Yi%2C+K">Kun Yi</a>, <a href="/search/eess?searchtype=author&query=Fei%2C+J">Jingru Fei</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+Q">Qi Zhang</a>, <a href="/search/eess?searchtype=author&query=He%2C+H">Hui He</a>, <a href="/search/eess?searchtype=author&query=Hao%2C+S">Shufeng Hao</a>, <a href="/search/eess?searchtype=author&query=Lian%2C+D">Defu Lian</a>, <a href="/search/eess?searchtype=author&query=Fan%2C+W">Wei Fan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.01623v2-abstract-short" style="display: inline;"> While numerous forecasters have been proposed using different network architectures, the Transformer-based models have state-of-the-art performance in time series forecasting. However, forecasters based on Transformers are still suffering from vulnerability to high-frequency signals, efficiency in computation, and bottleneck in full-spectrum utilization, which essentially are the cornerstones for… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01623v2-abstract-full').style.display = 'inline'; document.getElementById('2411.01623v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.01623v2-abstract-full" style="display: none;"> While numerous forecasters have been proposed using different network architectures, the Transformer-based models have state-of-the-art performance in time series forecasting. However, forecasters based on Transformers are still suffering from vulnerability to high-frequency signals, efficiency in computation, and bottleneck in full-spectrum utilization, which essentially are the cornerstones for accurately predicting time series with thousands of points. In this paper, we explore a novel perspective of enlightening signal processing for deep time series forecasting. Inspired by the filtering process, we introduce one simple yet effective network, namely FilterNet, built upon our proposed learnable frequency filters to extract key informative temporal patterns by selectively passing or attenuating certain components of time series signals. Concretely, we propose two kinds of learnable filters in the FilterNet: (i) Plain shaping filter, that adopts a universal frequency kernel for signal filtering and temporal modeling; (ii) Contextual shaping filter, that utilizes filtered frequencies examined in terms of its compatibility with input signals for dependency learning. Equipped with the two filters, FilterNet can approximately surrogate the linear and attention mappings widely adopted in time series literature, while enjoying superb abilities in handling high-frequency noises and utilizing the whole frequency spectrum that is beneficial for forecasting. Finally, we conduct extensive experiments on eight time series forecasting benchmarks, and experimental results have demonstrated our superior performance in terms of both effectiveness and efficiency compared with state-of-the-art methods. Code is available at this repository: https://github.com/aikunyi/FilterNet <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01623v2-abstract-full').style.display = 'none'; document.getElementById('2411.01623v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 3 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by NeurIPS 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.24039">arXiv:2410.24039</a> <span> [<a href="https://arxiv.org/pdf/2410.24039">pdf</a>, <a href="https://arxiv.org/format/2410.24039">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Efficient Satellite-Ground Interconnection Design for Low-orbit Mega-Constellation Topology </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Liu%2C+W">Wenhao Liu</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+J">Jiazhi Wu</a>, <a href="/search/eess?searchtype=author&query=Lin%2C+Q">Quanwei Lin</a>, <a href="/search/eess?searchtype=author&query=Luo%2C+H">Handong Luo</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+Q">Qi Zhang</a>, <a href="/search/eess?searchtype=author&query=Qiu%2C+K">Kun Qiu</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+Z">Zhe Chen</a>, <a href="/search/eess?searchtype=author&query=Gao%2C+Y">Yue Gao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.24039v1-abstract-short" style="display: inline;"> The low-orbit mega-constellation network (LMCN) is an important part of the space-air-ground integrated network system. An effective satellite-ground interconnection design can result in a stable constellation topology for LMCNs. A naive solution is accessing the satellite with the longest remaining service time (LRST), which is widely used in previous designs. The Coordinated Satellite-Ground Int… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.24039v1-abstract-full').style.display = 'inline'; document.getElementById('2410.24039v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.24039v1-abstract-full" style="display: none;"> The low-orbit mega-constellation network (LMCN) is an important part of the space-air-ground integrated network system. An effective satellite-ground interconnection design can result in a stable constellation topology for LMCNs. A naive solution is accessing the satellite with the longest remaining service time (LRST), which is widely used in previous designs. The Coordinated Satellite-Ground Interconnecting (CSGI), the state-of-the-art algorithm, coordinates the establishment of ground-satellite links (GSLs). Compared with existing solutions, it reduces latency by 19% and jitter by 70% on average. However, CSGI only supports the scenario where terminals access only one satellite and cannot fully utilize the multi-access capabilities of terminals. Additionally, CSGI's high computational complexity poses deployment challenges. To overcome these problems, we propose the Classification-based Longest Remaining Service Time (C-LRST) algorithm. C-LRST supports the actual scenario with multi-access capabilities. It adds optional paths during routing with low computational complexity, improving end-to-end communications quality. We conduct our 1000s simulation from Brazil to Lithuania on the open-source platform Hypatia. Experiment results show that compared with CSGI, C-LRST reduces the latency and increases the throughput by approximately 60% and 40%, respectively. In addition, C-LRST's GSL switching number is 14, whereas CSGI is 23. C-LRST has better link stability than CSGI. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.24039v1-abstract-full').style.display = 'none'; document.getElementById('2410.24039v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">13 pages, 14 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.19880">arXiv:2410.19880</a> <span> [<a href="https://arxiv.org/pdf/2410.19880">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Implementing Deep Reinforcement Learning-Based Grid Voltage Control in Real-World Power Systems: Challenges and Insights </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Shi%2C+D">Di Shi</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+Q">Qiang Zhang</a>, <a href="/search/eess?searchtype=author&query=Hong%2C+M">Mingguo Hong</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+F">Fengyu Wang</a>, <a href="/search/eess?searchtype=author&query=Maslennikov%2C+S">Slava Maslennikov</a>, <a href="/search/eess?searchtype=author&query=Luo%2C+X">Xiaochuan Luo</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+Y">Yize Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.19880v1-abstract-short" style="display: inline;"> Deep reinforcement learning (DRL) holds significant promise for managing voltage control challenges in simulated power grid environments. However, its real-world application in power system operations remains underexplored. This study rigorously evaluates DRL's performance and limitations within actual operational contexts by utilizing detailed experiments across the IEEE 14-bus system, Illinois 2… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.19880v1-abstract-full').style.display = 'inline'; document.getElementById('2410.19880v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.19880v1-abstract-full" style="display: none;"> Deep reinforcement learning (DRL) holds significant promise for managing voltage control challenges in simulated power grid environments. However, its real-world application in power system operations remains underexplored. This study rigorously evaluates DRL's performance and limitations within actual operational contexts by utilizing detailed experiments across the IEEE 14-bus system, Illinois 200-bus system, and the ISO New England node-breaker model. Our analysis critically assesses DRL's effectiveness for grid control from a system operator's perspective, identifying specific performance bottlenecks. The findings provide actionable insights that highlight the necessity of advancing AI technologies to effectively address the growing complexities of modern power systems. This research underscores the vital role of DRL in enhancing grid management and reliability. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.19880v1-abstract-full').style.display = 'none'; document.getElementById('2410.19880v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages, 9 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.19559">arXiv:2410.19559</a> <span> [<a href="https://arxiv.org/pdf/2410.19559">pdf</a>, <a href="https://arxiv.org/format/2410.19559">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Flexibility Options: A Proposed Product for Managing Imbalance Risk </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Spyrou%2C+E">Elina Spyrou</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+Q">Qiwei Zhang</a>, <a href="/search/eess?searchtype=author&query=Hytowitz%2C+R+B">Robin B. Hytowitz</a>, <a href="/search/eess?searchtype=author&query=Hobbs%2C+B+F">Ben F. Hobbs</a>, <a href="/search/eess?searchtype=author&query=Tyagi%2C+S">Siddharth Tyagi</a>, <a href="/search/eess?searchtype=author&query=Cai%2C+M">Mengmeng Cai</a>, <a href="/search/eess?searchtype=author&query=Blonsky%2C+M">Michael Blonsky</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.19559v1-abstract-short" style="display: inline;"> The presence of variable renewable energy resources with uncertain outputs in day-ahead electricity markets results in additional balancing needs in real-time. Addressing those needs cost-effectively and reliably within a competitive market with unbundled products is challenging as both the demand for and the availability of flexibility depends on day-ahead energy schedules. Existing approaches fo… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.19559v1-abstract-full').style.display = 'inline'; document.getElementById('2410.19559v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.19559v1-abstract-full" style="display: none;"> The presence of variable renewable energy resources with uncertain outputs in day-ahead electricity markets results in additional balancing needs in real-time. Addressing those needs cost-effectively and reliably within a competitive market with unbundled products is challenging as both the demand for and the availability of flexibility depends on day-ahead energy schedules. Existing approaches for reserve procurement usually rely either on oversimplified demand curves that do not consider how system conditions that particular day affect the value of flexibility, or on bilateral trading of hedging instruments that are not co-optimized with day-ahead schedules. This article proposes a new product, `Flexibility Options', to address these two limitations. The demand for this product is endogenously determined in the day-ahead market and it is met cost-effectively by considering real-time supply curves for product providers, which are co-optimized with the energy supply. As we illustrate with numerical examples and mathematical analysis, the product addresses the hedging needs of participants with imbalances, provides a less intermittent revenue stream for participants with flexible outputs, promotes convergence between day-ahead and real-time energy prices, and ensures that the system operator is revenue-neutral. This article provides a comprehensive design that can be further tested and applied in large-scale systems. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.19559v1-abstract-full').style.display = 'none'; document.getElementById('2410.19559v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.19452">arXiv:2410.19452</a> <span> [<a href="https://arxiv.org/pdf/2410.19452">pdf</a>, <a href="https://arxiv.org/format/2410.19452">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> NeuroClips: Towards High-fidelity and Smooth fMRI-to-Video Reconstruction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Gong%2C+Z">Zixuan Gong</a>, <a href="/search/eess?searchtype=author&query=Bao%2C+G">Guangyin Bao</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+Q">Qi Zhang</a>, <a href="/search/eess?searchtype=author&query=Wan%2C+Z">Zhongwei Wan</a>, <a href="/search/eess?searchtype=author&query=Miao%2C+D">Duoqian Miao</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+S">Shoujin Wang</a>, <a href="/search/eess?searchtype=author&query=Zhu%2C+L">Lei Zhu</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+C">Changwei Wang</a>, <a href="/search/eess?searchtype=author&query=Xu%2C+R">Rongtao Xu</a>, <a href="/search/eess?searchtype=author&query=Hu%2C+L">Liang Hu</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+K">Ke Liu</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+Y">Yu Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.19452v2-abstract-short" style="display: inline;"> Reconstruction of static visual stimuli from non-invasion brain activity fMRI achieves great success, owning to advanced deep learning models such as CLIP and Stable Diffusion. However, the research on fMRI-to-video reconstruction remains limited since decoding the spatiotemporal perception of continuous visual experiences is formidably challenging. We contend that the key to addressing these chal… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.19452v2-abstract-full').style.display = 'inline'; document.getElementById('2410.19452v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.19452v2-abstract-full" style="display: none;"> Reconstruction of static visual stimuli from non-invasion brain activity fMRI achieves great success, owning to advanced deep learning models such as CLIP and Stable Diffusion. However, the research on fMRI-to-video reconstruction remains limited since decoding the spatiotemporal perception of continuous visual experiences is formidably challenging. We contend that the key to addressing these challenges lies in accurately decoding both high-level semantics and low-level perception flows, as perceived by the brain in response to video stimuli. To the end, we propose NeuroClips, an innovative framework to decode high-fidelity and smooth video from fMRI. NeuroClips utilizes a semantics reconstructor to reconstruct video keyframes, guiding semantic accuracy and consistency, and employs a perception reconstructor to capture low-level perceptual details, ensuring video smoothness. During inference, it adopts a pre-trained T2V diffusion model injected with both keyframes and low-level perception flows for video reconstruction. Evaluated on a publicly available fMRI-video dataset, NeuroClips achieves smooth high-fidelity video reconstruction of up to 6s at 8FPS, gaining significant improvements over state-of-the-art models in various metrics, e.g., a 128% improvement in SSIM and an 81% improvement in spatiotemporal metrics. Our project is available at https://github.com/gongzix/NeuroClips. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.19452v2-abstract-full').style.display = 'none'; document.getElementById('2410.19452v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 25 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">NeurIPS 2024 Oral</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.19383">arXiv:2410.19383</a> <span> [<a href="https://arxiv.org/pdf/2410.19383">pdf</a>, <a href="https://arxiv.org/format/2410.19383">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> A Modulo Sampling Hardware Prototype and Reconstruction Algorithm Evaluation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhu%2C+J">Jiang Zhu</a>, <a href="/search/eess?searchtype=author&query=Ma%2C+J">Junnan Ma</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+Z">Zhenlong Liu</a>, <a href="/search/eess?searchtype=author&query=Qu%2C+F">Fengzhong Qu</a>, <a href="/search/eess?searchtype=author&query=Zhu%2C+Z">Zheng Zhu</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+Q">Qi Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.19383v1-abstract-short" style="display: inline;"> Analog-to-digital converters (ADCs) play a vital important role in any devices via manipulating analog signals in a digital manner. Given that the amplitude of the signal exceeds the dynamic range of the ADCs, clipping occurs and the quality of the digitized signal degrades significantly. In this paper, we design a joint modulo sampling hardware and processing prototype which improves the ADCs' dy… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.19383v1-abstract-full').style.display = 'inline'; document.getElementById('2410.19383v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.19383v1-abstract-full" style="display: none;"> Analog-to-digital converters (ADCs) play a vital important role in any devices via manipulating analog signals in a digital manner. Given that the amplitude of the signal exceeds the dynamic range of the ADCs, clipping occurs and the quality of the digitized signal degrades significantly. In this paper, we design a joint modulo sampling hardware and processing prototype which improves the ADCs' dynamic range by folding the signal before sampling. Both the detailed design of the hardware and the recovery results of various state-of-the-art processing algorithms including our proposed unlimited sampling line spectral estimation (USLSE) algorithm are presented. Additionally, key issues that arise during implementation are also addressed. It is demonstrated that the USLSE algorithm successfully recovers the original signal with a frequency of 2.5 kHz and an amplitude 10 times the ADC's dynamic range, and the linear prediction (LP) algorithm successfully recovers the original signal with a frequency of 3.5 kHz and an amplitude 10 times the ADC's dynamic range. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.19383v1-abstract-full').style.display = 'none'; document.getElementById('2410.19383v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.17799">arXiv:2410.17799</a> <span> [<a href="https://arxiv.org/pdf/2410.17799">pdf</a>, <a href="https://arxiv.org/format/2410.17799">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> OmniFlatten: An End-to-end GPT Model for Seamless Voice Conversation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhang%2C+Q">Qinglin Zhang</a>, <a href="/search/eess?searchtype=author&query=Cheng%2C+L">Luyao Cheng</a>, <a href="/search/eess?searchtype=author&query=Deng%2C+C">Chong Deng</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+Q">Qian Chen</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+W">Wen Wang</a>, <a href="/search/eess?searchtype=author&query=Zheng%2C+S">Siqi Zheng</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+J">Jiaqing Liu</a>, <a href="/search/eess?searchtype=author&query=Yu%2C+H">Hai Yu</a>, <a href="/search/eess?searchtype=author&query=Tan%2C+C">Chaohong Tan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.17799v1-abstract-short" style="display: inline;"> Full-duplex spoken dialogue systems significantly advance over traditional turn-based dialogue systems, as they allow simultaneous bidirectional communication, closely mirroring human-human interactions. However, achieving low latency and natural interactions in full-duplex dialogue systems remains a significant challenge, especially considering human conversation dynamics such as interruptions, b… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.17799v1-abstract-full').style.display = 'inline'; document.getElementById('2410.17799v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.17799v1-abstract-full" style="display: none;"> Full-duplex spoken dialogue systems significantly advance over traditional turn-based dialogue systems, as they allow simultaneous bidirectional communication, closely mirroring human-human interactions. However, achieving low latency and natural interactions in full-duplex dialogue systems remains a significant challenge, especially considering human conversation dynamics such as interruptions, backchannels, and overlapping speech. In this paper, we introduce a novel End-to-End GPT-based model OmniFlatten for full-duplex conversation, capable of effectively modeling the complex behaviors inherent to natural conversations with low latency. To achieve full-duplex communication capabilities, we propose a multi-stage post-training scheme that progressively adapts a text-based large language model (LLM) backbone into a speech-text dialogue LLM, capable of generating text and speech in real time, without modifying the architecture of the backbone LLM. The training process comprises three stages: modality alignment, half-duplex dialogue learning, and full-duplex dialogue learning. Throughout all training stages, we standardize the data using a flattening operation, which allows us to unify the training methods and the model architecture across different modalities and tasks. Our approach offers a straightforward modeling technique and a promising research direction for developing efficient and natural end-to-end full-duplex spoken dialogue systems. Audio samples of dialogues generated by OmniFlatten can be found at this web site (https://omniflatten.github.io/). <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.17799v1-abstract-full').style.display = 'none'; document.getElementById('2410.17799v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Work in progress</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.14934">arXiv:2410.14934</a> <span> [<a href="https://arxiv.org/pdf/2410.14934">pdf</a>, <a href="https://arxiv.org/format/2410.14934">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Development of a Simple and Novel Digital Twin Framework for Industrial Robots in Intelligent robotics manufacturing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Xiang%2C+T">Tianyi Xiang</a>, <a href="/search/eess?searchtype=author&query=Li%2C+B">Borui Li</a>, <a href="/search/eess?searchtype=author&query=Pan%2C+X">Xin Pan</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+Q">Quan Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.14934v1-abstract-short" style="display: inline;"> This paper has proposed an easily replicable and novel approach for developing a Digital Twin (DT) system for industrial robots in intelligent manufacturing applications. Our framework enables effective communication via Robot Web Service (RWS), while a real-time simulation is implemented in Unity 3D and Web-based Platform without any other 3rd party tools. The framework can do real-time visualiza… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.14934v1-abstract-full').style.display = 'inline'; document.getElementById('2410.14934v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.14934v1-abstract-full" style="display: none;"> This paper has proposed an easily replicable and novel approach for developing a Digital Twin (DT) system for industrial robots in intelligent manufacturing applications. Our framework enables effective communication via Robot Web Service (RWS), while a real-time simulation is implemented in Unity 3D and Web-based Platform without any other 3rd party tools. The framework can do real-time visualization and control of the entire work process, as well as implement real-time path planning based on algorithms executed in MATLAB. Results verify the high communication efficiency with a refresh rate of only $17 ms$. Furthermore, our developed web-based platform and Graphical User Interface (GUI) enable easy accessibility and user-friendliness in real-time control. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.14934v1-abstract-full').style.display = 'none'; document.getElementById('2410.14934v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> 20th International Conference on Automation Science and Engineering (CASE 2024) </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.14928">arXiv:2410.14928</a> <span> [<a href="https://arxiv.org/pdf/2410.14928">pdf</a>, <a href="https://arxiv.org/format/2410.14928">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> A Novel Approach to Grasping Control of Soft Robotic Grippers based on Digital Twin </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Xiang%2C+T">Tianyi Xiang</a>, <a href="/search/eess?searchtype=author&query=Li%2C+B">Borui Li</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+Q">Quan Zhang</a>, <a href="/search/eess?searchtype=author&query=Leach%2C+M">Mark Leach</a>, <a href="/search/eess?searchtype=author&query=Lim%2C+E+G">Eng Gee Lim</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.14928v1-abstract-short" style="display: inline;"> This paper has proposed a Digital Twin (DT) framework for real-time motion and pose control of soft robotic grippers. The developed DT is based on an industrial robot workstation, integrated with our newly proposed approach for soft gripper control, primarily based on computer vision, for setting the driving pressure for desired gripper status in real-time. Knowing the gripper motion, the gripper… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.14928v1-abstract-full').style.display = 'inline'; document.getElementById('2410.14928v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.14928v1-abstract-full" style="display: none;"> This paper has proposed a Digital Twin (DT) framework for real-time motion and pose control of soft robotic grippers. The developed DT is based on an industrial robot workstation, integrated with our newly proposed approach for soft gripper control, primarily based on computer vision, for setting the driving pressure for desired gripper status in real-time. Knowing the gripper motion, the gripper parameters (e.g. curvatures and bending angles, etc.) are simulated by kinematics modelling in Unity 3D, which is based on four-piecewise constant curvature kinematics. The mapping in between the driving pressure and gripper parameters is achieved by implementing OpenCV based image processing algorithms and data fitting. Results show that our DT-based approach can achieve satisfactory performance in real-time control of soft gripper manipulation, which can satisfy a wide range of industrial applications. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.14928v1-abstract-full').style.display = 'none'; document.getElementById('2410.14928v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> 29th International Conference on Automation and Computing (ICAC 2024) </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.14422">arXiv:2410.14422</a> <span> [<a href="https://arxiv.org/pdf/2410.14422">pdf</a>, <a href="https://arxiv.org/format/2410.14422">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Deep Uncertainty-aware Tracking for Maneuvering Targets </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhang%2C+S">Shuyang Zhang</a>, <a href="/search/eess?searchtype=author&query=Gao%2C+C">Chang Gao</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+Q">Qingfu Zhang</a>, <a href="/search/eess?searchtype=author&query=Jia%2C+T">Tianyi Jia</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+H">Hongwei Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.14422v1-abstract-short" style="display: inline;"> When tracking maneuvering targets, model-driven approaches encounter difficulties in comprehensively delineating complex real-world scenarios and are prone to model mismatch when the targets maneuver. Meanwhile, contemporary data-driven methods have overlooked measurements' confidence, markedly escalating the challenge of fitting a mapping from measurement sequences to target state sequences. To a… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.14422v1-abstract-full').style.display = 'inline'; document.getElementById('2410.14422v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.14422v1-abstract-full" style="display: none;"> When tracking maneuvering targets, model-driven approaches encounter difficulties in comprehensively delineating complex real-world scenarios and are prone to model mismatch when the targets maneuver. Meanwhile, contemporary data-driven methods have overlooked measurements' confidence, markedly escalating the challenge of fitting a mapping from measurement sequences to target state sequences. To address these issues, this paper presents a deep maneuvering target tracking methodology based on target state space projection. The proposed methodology initially establishes a projection from the target measurement sequence to the target state space by formulating the probability density function of measurement error and samples the distribution information of measurement noise within the target state space as a measurement representation. Under this representation, the sequential regression task of target state estimation can be transmuted into a task of detecting the target location in the state space. Subsequently, a deep detection network is devised to accomplish target location detection in the target state space. Finally, a loss function is designed to facilitate the network's training for attaining the desired network performance. Simulation experiments suggest that the proposed method can maintain satisfactory tracking performance even when the target maneuvers, and can rapidly converge and achieve higher estimation accuracy compared with existing methods after the target maneuvers. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.14422v1-abstract-full').style.display = 'none'; document.getElementById('2410.14422v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.14287">arXiv:2410.14287</a> <span> [<a href="https://arxiv.org/pdf/2410.14287">pdf</a>, <a href="https://arxiv.org/format/2410.14287">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Compression using Discrete Multi-Level Divisor Transform for Heterogeneous Sensor Data </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Kuldeep%2C+G">Gajraj Kuldeep</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+Q">Qi Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.14287v1-abstract-short" style="display: inline;"> In recent years, multiple sensor-based devices and systems have been deployed in smart agriculture, industrial automation, E-Health, etc. The diversity of sensor data types and the amount of data pose critical challenges for data transmission and storage. The conventional data compression methods are tuned for a data type, e.g., OGG for audio. Due to such limitations, traditional compression algor… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.14287v1-abstract-full').style.display = 'inline'; document.getElementById('2410.14287v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.14287v1-abstract-full" style="display: none;"> In recent years, multiple sensor-based devices and systems have been deployed in smart agriculture, industrial automation, E-Health, etc. The diversity of sensor data types and the amount of data pose critical challenges for data transmission and storage. The conventional data compression methods are tuned for a data type, e.g., OGG for audio. Due to such limitations, traditional compression algorithms may not be suitable for a system with multiple sensors. In this paper, we present a novel transform named as discrete multi-level divisor transform (DMDT). A signal compression algorithm is proposed for one-dimensional signals using the DMDT. The universality of the proposed compression algorithm is demonstrated by considering various types of signals, such as audio, electrocardiogram, accelerometer, magnetometer, photoplethysmography, and gyroscope. The proposed DMDT-based signal compression algorithm is also compared with the state-of-the-art compression algorithms. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.14287v1-abstract-full').style.display = 'none'; document.getElementById('2410.14287v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.13436">arXiv:2410.13436</a> <span> [<a href="https://arxiv.org/pdf/2410.13436">pdf</a>, <a href="https://arxiv.org/format/2410.13436">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Multi-frame Detection via Graph Neural Networks: A Link Prediction Approach </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Lin%2C+Z">Zhihao Lin</a>, <a href="/search/eess?searchtype=author&query=Gao%2C+C">Chang Gao</a>, <a href="/search/eess?searchtype=author&query=Yan%2C+J">Junkun Yan</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+Q">Qingfu Zhang</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+H">Hongwei Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.13436v2-abstract-short" style="display: inline;"> Multi-frame detection algorithms can effectively utilize the correlation between consecutive echoes to improve the detection performance of weak targets. Existing efficient multi-frame detection algorithms are typically based on three sequential steps: plot extraction via a relative low primary threshold, track search and track detection. However, these three-stage processing algorithms may result… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13436v2-abstract-full').style.display = 'inline'; document.getElementById('2410.13436v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.13436v2-abstract-full" style="display: none;"> Multi-frame detection algorithms can effectively utilize the correlation between consecutive echoes to improve the detection performance of weak targets. Existing efficient multi-frame detection algorithms are typically based on three sequential steps: plot extraction via a relative low primary threshold, track search and track detection. However, these three-stage processing algorithms may result in a notable loss of detection performance and do not fully leverage the available echo information across frames. As to applying graph neural networks in multi-frame detection, the algorithms are primarily based on node classification tasks, which cannot directly output target tracks. In this paper, we reformulate the multi-frame detection problem as a link prediction task in graphs. First, we perform a rough association of multi-frame observations that exceed the low threshold to construct observation association graphs. Subsequently, a multi-feature link prediction network is designed based on graph neural networks, which integrates multi-dimensional information, including echo structure, Doppler information, and spatio-temporal coupling of plots. By leveraging the principle of link prediction, we unifies the processes of track search and track detection into one step to reduce performance loss and directly output target tracks. Experimental results indicate that, compared with traditional single-frame and multi-frame detection algorithms, the proposed algorithm improves the detection performance of weak targets while suppressing false alarms. Additionally, interpretable analysis shows that the designed network effectively integrates the utilized features, allowing for accurate associations between targets and false alarms. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13436v2-abstract-full').style.display = 'none'; document.getElementById('2410.13436v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 17 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.13219">arXiv:2410.13219</a> <span> [<a href="https://arxiv.org/pdf/2410.13219">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Fundamental Limits of Pulse Based UWB ISAC Systems: A Parameter Estimation Perspective </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Liu%2C+F">Fan Liu</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+T">Tingting Zhang</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+Z">Zenan Zhang</a>, <a href="/search/eess?searchtype=author&query=Cao%2C+B">Bin Cao</a>, <a href="/search/eess?searchtype=author&query=Shen%2C+Y">Yuan Shen</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+Q">Qinyu Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.13219v1-abstract-short" style="display: inline;"> Impulse radio ultra-wideband (IR-UWB) signals stand out for their high temporal resolution, low cost, and large bandwidth, making them a highly promising option for integrated sensing and communication (ISAC) systems. In this paper, we design an ISAC system for a bi-static passive sensing scenario that accommodates multiple targets. Specifically, we introduce two typical modulation schemes, PPM an… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13219v1-abstract-full').style.display = 'inline'; document.getElementById('2410.13219v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.13219v1-abstract-full" style="display: none;"> Impulse radio ultra-wideband (IR-UWB) signals stand out for their high temporal resolution, low cost, and large bandwidth, making them a highly promising option for integrated sensing and communication (ISAC) systems. In this paper, we design an ISAC system for a bi-static passive sensing scenario that accommodates multiple targets. Specifically, we introduce two typical modulation schemes, PPM and BPSK, for data transmission. The essential coupling between sensing and communication is examined through the Fisher information matrix (FIM). Accordingly, we introduce a pilot-based decoupling approach that relies on known time-delays, as well as a differential decoupling strategy that uses a known starting symbol position. Finally, we assess the sensing and communication performance under various modulation and demodulation schemes under the constraints of current UWB standards. This assessment utilizes the Cramer-Rao Lower Bound (CRLB) for sensing and the Shannon capacity limit for communication, offering theoretical insights into choosing suitable data signal processing methods in real-world applications. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13219v1-abstract-full').style.display = 'none'; document.getElementById('2410.13219v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.19370">arXiv:2409.19370</a> <span> [<a href="https://arxiv.org/pdf/2409.19370">pdf</a>, <a href="https://arxiv.org/ps/2409.19370">ps</a>, <a href="https://arxiv.org/format/2409.19370">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> MambaEviScrib: Mamba and Evidence-Guided Consistency Enhance CNN Robustness for Scribble-Based Weakly Supervised Ultrasound Image Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Han%2C+X">Xiaoxiang Han</a>, <a href="/search/eess?searchtype=author&query=Li%2C+X">Xinyu Li</a>, <a href="/search/eess?searchtype=author&query=Shang%2C+J">Jiang Shang</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+Y">Yiman Liu</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+K">Keyan Chen</a>, <a href="/search/eess?searchtype=author&query=Xu%2C+S">Shugong Xu</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+Q">Qiaohong Liu</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+Q">Qi Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.19370v2-abstract-short" style="display: inline;"> Segmenting anatomical structures and lesions from ultrasound images contributes to disease assessment. Weakly supervised learning (WSL) based on sparse annotation has achieved encouraging performance and demonstrated the potential to reduce annotation costs. This study attempts to introduce scribble-based WSL into ultrasound image segmentation tasks. However, ultrasound images often suffer from po… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.19370v2-abstract-full').style.display = 'inline'; document.getElementById('2409.19370v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.19370v2-abstract-full" style="display: none;"> Segmenting anatomical structures and lesions from ultrasound images contributes to disease assessment. Weakly supervised learning (WSL) based on sparse annotation has achieved encouraging performance and demonstrated the potential to reduce annotation costs. This study attempts to introduce scribble-based WSL into ultrasound image segmentation tasks. However, ultrasound images often suffer from poor contrast and unclear edges, coupled with insufficient supervison signals for edges, posing challenges to edge prediction. Uncertainty modeling has been proven to facilitate models in dealing with these issues. Nevertheless, existing uncertainty estimation paradigms are not robust enough and often filter out predictions near decision boundaries, resulting in unstable edge predictions. Therefore, we propose leveraging predictions near decision boundaries effectively. Specifically, we introduce Dempster-Shafer Theory (DST) of evidence to design an Evidence-Guided Consistency strategy. This strategy utilizes high-evidence predictions, which are more likely to occur near high-density regions, to guide the optimization of low-evidence predictions that may appear near decision boundaries. Furthermore, the diverse sizes and locations of lesions in ultrasound images pose a challenge for CNNs with local receptive fields, as they struggle to model global information. Therefore, we introduce Visual Mamba based on structured state space sequence models, which achieves long-range dependency with linear computational complexity, and we construct a novel hybrid CNN-Mamba framework. During training, the collaboration between the CNN branch and the Mamba branch in the proposed framework draws inspiration from each other based on the EGC strategy. Experiments demonstrate the competitiveness of the proposed method. Dataset and code will be available on https://github.com/GtLinyer/MambaEviScrib. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.19370v2-abstract-full').style.display = 'none'; document.getElementById('2409.19370v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 28 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.18558">arXiv:2409.18558</a> <span> [<a href="https://arxiv.org/pdf/2409.18558">pdf</a>, <a href="https://arxiv.org/format/2409.18558">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> XWSB: A Blend System Utilizing XLS-R and WavLM with SLS Classifier detection system for SVDD 2024 Challenge </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhang%2C+Q">Qishan Zhang</a>, <a href="/search/eess?searchtype=author&query=Wen%2C+S">Shuangbing Wen</a>, <a href="/search/eess?searchtype=author&query=Yan%2C+F">Fangke Yan</a>, <a href="/search/eess?searchtype=author&query=Hu%2C+T">Tao Hu</a>, <a href="/search/eess?searchtype=author&query=Li%2C+J">Jun Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.18558v1-abstract-short" style="display: inline;"> This paper introduces the model structure used in the SVDD 2024 Challenge. The SVDD 2024 challenge has been introduced this year for the first time. Singing voice deepfake detection (SVDD) which faces complexities due to informal speech intonations and varying speech rates. In this paper, we propose the XWSB system, which achieved SOTA per-formance in the SVDD challenge. XWSB stands for XLS-R, Wav… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.18558v1-abstract-full').style.display = 'inline'; document.getElementById('2409.18558v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.18558v1-abstract-full" style="display: none;"> This paper introduces the model structure used in the SVDD 2024 Challenge. The SVDD 2024 challenge has been introduced this year for the first time. Singing voice deepfake detection (SVDD) which faces complexities due to informal speech intonations and varying speech rates. In this paper, we propose the XWSB system, which achieved SOTA per-formance in the SVDD challenge. XWSB stands for XLS-R, WavLM, and SLS Blend, representing the integration of these technologies for the purpose of SVDD. Specifically, we used the best performing model structure XLS-R&SLS from the ASVspoof DF dataset, and applied SLS to WavLM to form the WavLM&SLS structure. Finally, we integrated two models to form the XWSB system. Experimental results show that our system demonstrates advanced recognition capabilities in the SVDD challenge, specifically achieving an EER of 2.32% in the CtrSVDD track. The code and data can be found at https://github.com/QiShanZhang/XWSB_for_ SVDD2024. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.18558v1-abstract-full').style.display = 'none'; document.getElementById('2409.18558v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> IEEE Spoken Language Technology Workshop 2024 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.15816">arXiv:2409.15816</a> <span> [<a href="https://arxiv.org/pdf/2409.15816">pdf</a>, <a href="https://arxiv.org/format/2409.15816">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Diffusion Models for Intelligent Transportation Systems: A Survey </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Peng%2C+M">Mingxing Peng</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+K">Kehua Chen</a>, <a href="/search/eess?searchtype=author&query=Guo%2C+X">Xusen Guo</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+Q">Qiming Zhang</a>, <a href="/search/eess?searchtype=author&query=Lu%2C+H">Hongliang Lu</a>, <a href="/search/eess?searchtype=author&query=Zhong%2C+H">Hui Zhong</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+D">Di Chen</a>, <a href="/search/eess?searchtype=author&query=Zhu%2C+M">Meixin Zhu</a>, <a href="/search/eess?searchtype=author&query=Yang%2C+H">Hai Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.15816v2-abstract-short" style="display: inline;"> Intelligent Transportation Systems (ITS) are vital in modern traffic management and optimization, significantly enhancing traffic efficiency and safety. Recently, diffusion models have emerged as transformative tools for addressing complex challenges within ITS. In this paper, we present a comprehensive survey of diffusion models for ITS, covering both theoretical and practical aspects. First, we… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.15816v2-abstract-full').style.display = 'inline'; document.getElementById('2409.15816v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.15816v2-abstract-full" style="display: none;"> Intelligent Transportation Systems (ITS) are vital in modern traffic management and optimization, significantly enhancing traffic efficiency and safety. Recently, diffusion models have emerged as transformative tools for addressing complex challenges within ITS. In this paper, we present a comprehensive survey of diffusion models for ITS, covering both theoretical and practical aspects. First, we introduce the theoretical foundations of diffusion models and their key variants, including conditional diffusion models and latent diffusion models, highlighting their suitability for modeling complex, multi-modal traffic data and enabling controllable generation. Second, we outline the primary challenges in ITS and the corresponding advantages of diffusion models, providing readers with a deeper understanding of the intersection between ITS and diffusion models. Third, we offer a multi-perspective investigation of current applications of diffusion models in ITS domains, including autonomous driving, traffic simulation, trajectory prediction, and traffic safety. Finally, we discuss state-of-the-art diffusion model techniques and highlight key ITS research directions that warrant further investigation. Through this structured overview, we aim to provide researchers with a comprehensive understanding of diffusion models for ITS, thereby advancing their future applications in the transportation domain. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.15816v2-abstract-full').style.display = 'none'; document.getElementById('2409.15816v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 24 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">7 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.15061">arXiv:2409.15061</a> <span> [<a href="https://arxiv.org/pdf/2409.15061">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Cloud Deployment of Large-Scale Electromagnetic Transient Simulation -- Discovery and Experiences </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Luo%2C+X">Xiaochuan Luo</a>, <a href="/search/eess?searchtype=author&query=Ploof%2C+J">Jason Ploof</a>, <a href="/search/eess?searchtype=author&query=Fang%2C+X">Xinghao Fang</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+Q">Qiang Zhang</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+S">Song Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.15061v1-abstract-short" style="display: inline;"> Electromagnetic Transient (EMT) simulation starts to play a critical role in modern power system planning and operations due to large penetration of inverter based resources (IBRs). The EMT studies are computationally intensive due to very small simulation time step and complex modeling of the protection and control of IBRs. It has been challenging for the traditional on-premises computing infrast… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.15061v1-abstract-full').style.display = 'inline'; document.getElementById('2409.15061v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.15061v1-abstract-full" style="display: none;"> Electromagnetic Transient (EMT) simulation starts to play a critical role in modern power system planning and operations due to large penetration of inverter based resources (IBRs). The EMT studies are computationally intensive due to very small simulation time step and complex modeling of the protection and control of IBRs. It has been challenging for the traditional on-premises computing infrastructure to meet the ever-increasing computing needs of large-scale EMT studies. This paper shares experience of ISO New England (ISO-NE) on a pilot deployment of EMT simulation in a public cloud using Amazon Web Services. The platform can successfully meet the large-scale EMT simulation computation needs in a cost-effective way while meeting cyber security and data privacy requirements. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.15061v1-abstract-full').style.display = 'none'; document.getElementById('2409.15061v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">6 pages, 4 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.00750">arXiv:2409.00750</a> <span> [<a href="https://arxiv.org/pdf/2409.00750">pdf</a>, <a href="https://arxiv.org/format/2409.00750">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> MaskGCT: Zero-Shot Text-to-Speech with Masked Generative Codec Transformer </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Wang%2C+Y">Yuancheng Wang</a>, <a href="/search/eess?searchtype=author&query=Zhan%2C+H">Haoyue Zhan</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+L">Liwei Liu</a>, <a href="/search/eess?searchtype=author&query=Zeng%2C+R">Ruihong Zeng</a>, <a href="/search/eess?searchtype=author&query=Guo%2C+H">Haotian Guo</a>, <a href="/search/eess?searchtype=author&query=Zheng%2C+J">Jiachen Zheng</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+Q">Qiang Zhang</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+X">Xueyao Zhang</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+S">Shunsi Zhang</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+Z">Zhizheng Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.00750v3-abstract-short" style="display: inline;"> The recent large-scale text-to-speech (TTS) systems are usually grouped as autoregressive and non-autoregressive systems. The autoregressive systems implicitly model duration but exhibit certain deficiencies in robustness and lack of duration controllability. Non-autoregressive systems require explicit alignment information between text and speech during training and predict durations for linguist… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.00750v3-abstract-full').style.display = 'inline'; document.getElementById('2409.00750v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.00750v3-abstract-full" style="display: none;"> The recent large-scale text-to-speech (TTS) systems are usually grouped as autoregressive and non-autoregressive systems. The autoregressive systems implicitly model duration but exhibit certain deficiencies in robustness and lack of duration controllability. Non-autoregressive systems require explicit alignment information between text and speech during training and predict durations for linguistic units (e.g. phone), which may compromise their naturalness. In this paper, we introduce Masked Generative Codec Transformer (MaskGCT), a fully non-autoregressive TTS model that eliminates the need for explicit alignment information between text and speech supervision, as well as phone-level duration prediction. MaskGCT is a two-stage model: in the first stage, the model uses text to predict semantic tokens extracted from a speech self-supervised learning (SSL) model, and in the second stage, the model predicts acoustic tokens conditioned on these semantic tokens. MaskGCT follows the mask-and-predict learning paradigm. During training, MaskGCT learns to predict masked semantic or acoustic tokens based on given conditions and prompts. During inference, the model generates tokens of a specified length in a parallel manner. Experiments with 100K hours of in-the-wild speech demonstrate that MaskGCT outperforms the current state-of-the-art zero-shot TTS systems in terms of quality, similarity, and intelligibility. Audio samples are available at https://maskgct.github.io/. We release our code and model checkpoints at https://github.com/open-mmlab/Amphion/blob/main/models/tts/maskgct. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.00750v3-abstract-full').style.display = 'none'; document.getElementById('2409.00750v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 1 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.16415">arXiv:2408.16415</a> <span> [<a href="https://arxiv.org/pdf/2408.16415">pdf</a>, <a href="https://arxiv.org/format/2408.16415">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Emerging Technologies">cs.ET</span> </div> </div> <p class="title is-5 mathjax"> UAV's Rotor Micro-Doppler Feature Extraction Using Integrated Sensing and Communication Signal: Algorithm Design and Testbed Evaluation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Wei%2C+J">Jiachen Wei</a>, <a href="/search/eess?searchtype=author&query=Ma%2C+D">Dingyou Ma</a>, <a href="/search/eess?searchtype=author&query=He%2C+F">Feiyang He</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+Q">Qixun Zhang</a>, <a href="/search/eess?searchtype=author&query=Feng%2C+Z">Zhiyong Feng</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+Z">Zhengfeng Liu</a>, <a href="/search/eess?searchtype=author&query=Liang%2C+T">Taohong Liang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.16415v1-abstract-short" style="display: inline;"> With the rapid application of unmanned aerial vehicles (UAVs) in urban areas, the identification and tracking of hovering UAVs have become critical challenges, significantly impacting the safety of aircraft take-off and landing operations. As a promising technology for 6G mobile systems, integrated sensing and communication (ISAC) can be used to detect high-mobility UAVs with a low deployment cost… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.16415v1-abstract-full').style.display = 'inline'; document.getElementById('2408.16415v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.16415v1-abstract-full" style="display: none;"> With the rapid application of unmanned aerial vehicles (UAVs) in urban areas, the identification and tracking of hovering UAVs have become critical challenges, significantly impacting the safety of aircraft take-off and landing operations. As a promising technology for 6G mobile systems, integrated sensing and communication (ISAC) can be used to detect high-mobility UAVs with a low deployment cost. The micro-Doppler signals from UAV rotors can be leveraged to address the detection of low-mobility and hovering UAVs using ISAC signals. However, determining whether the frame structure of the ISAC system can be used to identify UAVs, and how to accurately capture the weak rotor micro-Doppler signals of UAVs in complex environments, remain two challenging problems. This paper first proposes a novel frame structure for UAV micro-Doppler extraction and the representation of UAV micro-Doppler signals within the channel state information (CSI). Furthermore, to address complex environments and the interference caused by UAV body vibrations, the rotor micro-Doppler null space pursuit (rmD-NSP) algorithm and the feature extraction algorithm synchroextracting transform (SET) are designed to effectively separate UAV's rotor micro-Doppler signals and enhance their features in the spectrogram. Finally, both simulation and hardware testbed demonstrate that the proposed rmD-NSP algorithm enables the ISAC base station (BS) to accurately and completely extract UAV's rotor micro-Doppler signals. Within a 0.1s observation period, ISAC BS successfully captures eight rotations of the DJI M300 RTK UAV's rotor in urban environments. Compared to the existing AM-FM NSP and NSP signal decomposition algorithms, the integrity of the rotor micro-Doppler features is improved by 60%. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.16415v1-abstract-full').style.display = 'none'; document.getElementById('2408.16415v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.12102">arXiv:2408.12102</a> <span> [<a href="https://arxiv.org/pdf/2408.12102">pdf</a>, <a href="https://arxiv.org/format/2408.12102">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Integrating Audio, Visual, and Semantic Information for Enhanced Multimodal Speaker Diarization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Cheng%2C+L">Luyao Cheng</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+H">Hui Wang</a>, <a href="/search/eess?searchtype=author&query=Zheng%2C+S">Siqi Zheng</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+Y">Yafeng Chen</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+R">Rongjie Huang</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+Q">Qinglin Zhang</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+Q">Qian Chen</a>, <a href="/search/eess?searchtype=author&query=Li%2C+X">Xihao Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.12102v1-abstract-short" style="display: inline;"> Speaker diarization, the process of segmenting an audio stream or transcribed speech content into homogenous partitions based on speaker identity, plays a crucial role in the interpretation and analysis of human speech. Most existing speaker diarization systems rely exclusively on unimodal acoustic information, making the task particularly challenging due to the innate ambiguities of audio signals… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.12102v1-abstract-full').style.display = 'inline'; document.getElementById('2408.12102v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.12102v1-abstract-full" style="display: none;"> Speaker diarization, the process of segmenting an audio stream or transcribed speech content into homogenous partitions based on speaker identity, plays a crucial role in the interpretation and analysis of human speech. Most existing speaker diarization systems rely exclusively on unimodal acoustic information, making the task particularly challenging due to the innate ambiguities of audio signals. Recent studies have made tremendous efforts towards audio-visual or audio-semantic modeling to enhance performance. However, even the incorporation of up to two modalities often falls short in addressing the complexities of spontaneous and unstructured conversations. To exploit more meaningful dialogue patterns, we propose a novel multimodal approach that jointly utilizes audio, visual, and semantic cues to enhance speaker diarization. Our method elegantly formulates the multimodal modeling as a constrained optimization problem. First, we build insights into the visual connections among active speakers and the semantic interactions within spoken content, thereby establishing abundant pairwise constraints. Then we introduce a joint pairwise constraint propagation algorithm to cluster speakers based on these visual and semantic constraints. This integration effectively leverages the complementary strengths of different modalities, refining the affinity estimation between individual speaker embeddings. Extensive experiments conducted on multiple multimodal datasets demonstrate that our approach consistently outperforms state-of-the-art speaker diarization methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.12102v1-abstract-full').style.display = 'none'; document.getElementById('2408.12102v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.11582">arXiv:2408.11582</a> <span> [<a href="https://arxiv.org/pdf/2408.11582">pdf</a>, <a href="https://arxiv.org/format/2408.11582">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Enhanced Visual SLAM for Collision-free Driving with Lightweight Autonomous Cars </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Lin%2C+Z">Zhihao Lin</a>, <a href="/search/eess?searchtype=author&query=Tian%2C+Z">Zhen Tian</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+Q">Qi Zhang</a>, <a href="/search/eess?searchtype=author&query=Zhuang%2C+H">Hanyang Zhuang</a>, <a href="/search/eess?searchtype=author&query=Lan%2C+J">Jianglin Lan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.11582v1-abstract-short" style="display: inline;"> The paper presents a vision-based obstacle avoidance strategy for lightweight self-driving cars that can be run on a CPU-only device using a single RGB-D camera. The method consists of two steps: visual perception and path planning. The visual perception part uses ORBSLAM3 enhanced with optical flow to estimate the car's poses and extract rich texture information from the scene. In the path planni… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.11582v1-abstract-full').style.display = 'inline'; document.getElementById('2408.11582v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.11582v1-abstract-full" style="display: none;"> The paper presents a vision-based obstacle avoidance strategy for lightweight self-driving cars that can be run on a CPU-only device using a single RGB-D camera. The method consists of two steps: visual perception and path planning. The visual perception part uses ORBSLAM3 enhanced with optical flow to estimate the car's poses and extract rich texture information from the scene. In the path planning phase, we employ a method combining a control Lyapunov function and control barrier function in the form of quadratic program (CLF-CBF-QP) together with an obstacle shape reconstruction process (SRP) to plan safe and stable trajectories. To validate the performance and robustness of the proposed method, simulation experiments were conducted with a car in various complex indoor environments using the Gazebo simulation environment. Our method can effectively avoid obstacles in the scenes. The proposed algorithm outperforms benchmark algorithms in achieving more stable and shorter trajectories across multiple simulated scenes. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.11582v1-abstract-full').style.display = 'none'; document.getElementById('2408.11582v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">16 pages; Submitted to a journal</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.09245">arXiv:2408.09245</a> <span> [<a href="https://arxiv.org/pdf/2408.09245">pdf</a>, <a href="https://arxiv.org/format/2408.09245">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Accelerating Chance-constrained SCED via Scenario Compression </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhang%2C+Q">Qian Zhang</a>, <a href="/search/eess?searchtype=author&query=Xie%2C+L">Le Xie</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.09245v1-abstract-short" style="display: inline;"> This paper studies some compression methods to accelerate the scenario-based chance-constrained security-constrained economic dispatch (SCED) problem. In particular, we show that by exclusively employing the vertices after convex hull compression, an equivalent solution can be obtained compared to utilizing the entire scenario set. For other compression methods that might relax the original soluti… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.09245v1-abstract-full').style.display = 'inline'; document.getElementById('2408.09245v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.09245v1-abstract-full" style="display: none;"> This paper studies some compression methods to accelerate the scenario-based chance-constrained security-constrained economic dispatch (SCED) problem. In particular, we show that by exclusively employing the vertices after convex hull compression, an equivalent solution can be obtained compared to utilizing the entire scenario set. For other compression methods that might relax the original solution, such as box compression, this paper presents the compression risk validation scheme to assess the risk arising from the sample space. By quantifying the risk associated with compression, decision-makers are empowered to select either solution risk or compression risk as the risk metric, depending on the complexity of specific problems. Numerical examples based on the 118-bus system and synthetic Texas grids compare these two risk metrics. The results also demonstrate the efficiency of compression methods in both problem formulation and solving processes. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.09245v1-abstract-full').style.display = 'none'; document.getElementById('2408.09245v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.09132">arXiv:2408.09132</a> <span> [<a href="https://arxiv.org/pdf/2408.09132">pdf</a>, <a href="https://arxiv.org/format/2408.09132">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> RIS-based Over-the-air Diffractional Channel Coding </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Hui%2C+Y">Yingzhe Hui</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+S">Shuyi Chen</a>, <a href="/search/eess?searchtype=author&query=Qin%2C+Y">Yifan Qin</a>, <a href="/search/eess?searchtype=author&query=Meng%2C+W">Weixiao Meng</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+Q">Qiushi Zhang</a>, <a href="/search/eess?searchtype=author&query=Jin%2C+W">Wei Jin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.09132v1-abstract-short" style="display: inline;"> Reconfigurable Intelligent Surfaces (RIS) are programmable metasurfaces utilizing sub-wavelength meta-atoms and a controller for precise electromagnetic wave manipulation. This work introduces an innovative channel coding scheme, termed RIS-based diffractional channel coding (DCC), which capitalizes on diffraction between two RIS layers for signal-level encoding. Contrary to traditional methods, D… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.09132v1-abstract-full').style.display = 'inline'; document.getElementById('2408.09132v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.09132v1-abstract-full" style="display: none;"> Reconfigurable Intelligent Surfaces (RIS) are programmable metasurfaces utilizing sub-wavelength meta-atoms and a controller for precise electromagnetic wave manipulation. This work introduces an innovative channel coding scheme, termed RIS-based diffractional channel coding (DCC), which capitalizes on diffraction between two RIS layers for signal-level encoding. Contrary to traditional methods, DCC expands signal dimensions through diffraction, presenting a novel countermeasure to channel effects. This paper focuses on the operational principles of DCC, including encoder and decoder designs, and explores its possibilities to construct block and trellis codes, demonstrating its potential as both an alternative and a supplementary conventional coding scheme. Key advantages of DCC include eliminating extra power requirements for encoding, achieving computation at the speed of light, and enabling adjustable code distance, making it a progressive solution for efficient wireless communication, particularly in systems with large-scale data or massive MIMO. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.09132v1-abstract-full').style.display = 'none'; document.getElementById('2408.09132v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">17 pages, 6 figures, accepted by IEEE</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.08242">arXiv:2408.08242</a> <span> [<a href="https://arxiv.org/pdf/2408.08242">pdf</a>, <a href="https://arxiv.org/ps/2408.08242">ps</a>, <a href="https://arxiv.org/format/2408.08242">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> A Conflicts-free, Speed-lossless KAN-based Reinforcement Learning Decision System for Interactive Driving in Roundabouts </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Lin%2C+Z">Zhihao Lin</a>, <a href="/search/eess?searchtype=author&query=Tian%2C+Z">Zhen Tian</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+Q">Qi Zhang</a>, <a href="/search/eess?searchtype=author&query=Ye%2C+Z">Ziyang Ye</a>, <a href="/search/eess?searchtype=author&query=Zhuang%2C+H">Hanyang Zhuang</a>, <a href="/search/eess?searchtype=author&query=Lan%2C+J">Jianglin Lan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.08242v1-abstract-short" style="display: inline;"> Safety and efficiency are crucial for autonomous driving in roundabouts, especially in the context of mixed traffic where autonomous vehicles (AVs) and human-driven vehicles coexist. This paper introduces a learning-based algorithm tailored to foster safe and efficient driving behaviors across varying levels of traffic flows in roundabouts. The proposed algorithm employs a deep Q-learning network… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.08242v1-abstract-full').style.display = 'inline'; document.getElementById('2408.08242v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.08242v1-abstract-full" style="display: none;"> Safety and efficiency are crucial for autonomous driving in roundabouts, especially in the context of mixed traffic where autonomous vehicles (AVs) and human-driven vehicles coexist. This paper introduces a learning-based algorithm tailored to foster safe and efficient driving behaviors across varying levels of traffic flows in roundabouts. The proposed algorithm employs a deep Q-learning network to effectively learn safe and efficient driving strategies in complex multi-vehicle roundabouts. Additionally, a KAN (Kolmogorov-Arnold network) enhances the AVs' ability to learn their surroundings robustly and precisely. An action inspector is integrated to replace dangerous actions to avoid collisions when the AV interacts with the environment, and a route planner is proposed to enhance the driving efficiency and safety of the AVs. Moreover, a model predictive control is adopted to ensure stability and precision of the driving actions. The results show that our proposed system consistently achieves safe and efficient driving whilst maintaining a stable training process, as evidenced by the smooth convergence of the reward function and the low variance in the training curves across various traffic flows. Compared to state-of-the-art benchmarks, the proposed algorithm achieves a lower number of collisions and reduced travel time to destination. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.08242v1-abstract-full').style.display = 'none'; document.getElementById('2408.08242v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">15 pages, 12 figures, submitted to an IEEE journal</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.02582">arXiv:2408.02582</a> <span> [<a href="https://arxiv.org/pdf/2408.02582">pdf</a>, <a href="https://arxiv.org/format/2408.02582">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Clustering and Mining Accented Speech for Inclusive and Fair Speech Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Kim%2C+J">Jaeyoung Kim</a>, <a href="/search/eess?searchtype=author&query=Lu%2C+H">Han Lu</a>, <a href="/search/eess?searchtype=author&query=Khorram%2C+S">Soheil Khorram</a>, <a href="/search/eess?searchtype=author&query=Tripathi%2C+A">Anshuman Tripathi</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+Q">Qian Zhang</a>, <a href="/search/eess?searchtype=author&query=Sak%2C+H">Hasim Sak</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.02582v1-abstract-short" style="display: inline;"> Modern automatic speech recognition (ASR) systems are typically trained on more than tens of thousands hours of speech data, which is one of the main factors for their great success. However, the distribution of such data is typically biased towards common accents or typical speech patterns. As a result, those systems often poorly perform on atypical accented speech. In this paper, we present acce… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.02582v1-abstract-full').style.display = 'inline'; document.getElementById('2408.02582v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.02582v1-abstract-full" style="display: none;"> Modern automatic speech recognition (ASR) systems are typically trained on more than tens of thousands hours of speech data, which is one of the main factors for their great success. However, the distribution of such data is typically biased towards common accents or typical speech patterns. As a result, those systems often poorly perform on atypical accented speech. In this paper, we present accent clustering and mining schemes for fair speech recognition systems which can perform equally well on under-represented accented speech. For accent recognition, we applied three schemes to overcome limited size of supervised accent data: supervised or unsupervised pre-training, distributionally robust optimization (DRO) and unsupervised clustering. Three schemes can significantly improve the accent recognition model especially for unbalanced and small accented speech. Fine-tuning ASR on the mined Indian accent speech using the proposed supervised or unsupervised clustering schemes showed 10.0% and 5.3% relative improvements compared to fine-tuning on the randomly sampled speech, respectively. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.02582v1-abstract-full').style.display = 'none'; document.getElementById('2408.02582v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.00365">arXiv:2408.00365</a> <span> [<a href="https://arxiv.org/pdf/2408.00365">pdf</a>, <a href="https://arxiv.org/format/2408.00365">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Multimodal Fusion and Coherence Modeling for Video Topic Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Yu%2C+H">Hai Yu</a>, <a href="/search/eess?searchtype=author&query=Deng%2C+C">Chong Deng</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+Q">Qinglin Zhang</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+J">Jiaqing Liu</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+Q">Qian Chen</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+W">Wen Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.00365v1-abstract-short" style="display: inline;"> The video topic segmentation (VTS) task segments videos into intelligible, non-overlapping topics, facilitating efficient comprehension of video content and quick access to specific content. VTS is also critical to various downstream video understanding tasks. Traditional VTS methods using shallow features or unsupervised approaches struggle to accurately discern the nuances of topical transitions… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.00365v1-abstract-full').style.display = 'inline'; document.getElementById('2408.00365v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.00365v1-abstract-full" style="display: none;"> The video topic segmentation (VTS) task segments videos into intelligible, non-overlapping topics, facilitating efficient comprehension of video content and quick access to specific content. VTS is also critical to various downstream video understanding tasks. Traditional VTS methods using shallow features or unsupervised approaches struggle to accurately discern the nuances of topical transitions. Recently, supervised approaches have achieved superior performance on video action or scene segmentation over unsupervised approaches. In this work, we improve supervised VTS by thoroughly exploring multimodal fusion and multimodal coherence modeling. Specifically, (1) we enhance multimodal fusion by exploring different architectures using cross-attention and mixture of experts. (2) To generally strengthen multimodality alignment and fusion, we pre-train and fine-tune the model with multimodal contrastive learning. (3) We propose a new pre-training task tailored for the VTS task, and a novel fine-tuning task for enhancing multimodal coherence modeling for VTS. We evaluate the proposed approaches on educational videos, in the form of lectures, due to the vital role of topic segmentation of educational videos in boosting learning experiences. Additionally, we introduce a large-scale Chinese lecture video dataset to augment the existing English corpus, promoting further research in VTS. Experiments on both English and Chinese lecture datasets demonstrate that our model achieves superior VTS performance compared to competitive unsupervised and supervised baselines. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.00365v1-abstract-full').style.display = 'none'; document.getElementById('2408.00365v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.21646">arXiv:2407.21646</a> <span> [<a href="https://arxiv.org/pdf/2407.21646">pdf</a>, <a href="https://arxiv.org/format/2407.21646">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Towards Achieving Human Parity on End-to-end Simultaneous Speech Translation via LLM Agent </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Cheng%2C+S">Shanbo Cheng</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+Z">Zhichao Huang</a>, <a href="/search/eess?searchtype=author&query=Ko%2C+T">Tom Ko</a>, <a href="/search/eess?searchtype=author&query=Li%2C+H">Hang Li</a>, <a href="/search/eess?searchtype=author&query=Peng%2C+N">Ningxin Peng</a>, <a href="/search/eess?searchtype=author&query=Xu%2C+L">Lu Xu</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+Q">Qini Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.21646v2-abstract-short" style="display: inline;"> In this paper, we present Cross Language Agent -- Simultaneous Interpretation, CLASI, a high-quality and human-like Simultaneous Speech Translation (SiST) System. Inspired by professional human interpreters, we utilize a novel data-driven read-write strategy to balance the translation quality and latency. To address the challenge of translating in-domain terminologies, CLASI employs a multi-modal… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.21646v2-abstract-full').style.display = 'inline'; document.getElementById('2407.21646v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.21646v2-abstract-full" style="display: none;"> In this paper, we present Cross Language Agent -- Simultaneous Interpretation, CLASI, a high-quality and human-like Simultaneous Speech Translation (SiST) System. Inspired by professional human interpreters, we utilize a novel data-driven read-write strategy to balance the translation quality and latency. To address the challenge of translating in-domain terminologies, CLASI employs a multi-modal retrieving module to obtain relevant information to augment the translation. Supported by LLMs, our approach can generate error-tolerated translation by considering the input audio, historical context, and retrieved information. Experimental results show that our system outperforms other systems by significant margins. Aligned with professional human interpreters, we evaluate CLASI with a better human evaluation metric, valid information proportion (VIP), which measures the amount of information that can be successfully conveyed to the listeners. In the real-world scenarios, where the speeches are often disfluent, informal, and unclear, CLASI achieves VIP of 81.3% and 78.0% for Chinese-to-English and English-to-Chinese translation directions, respectively. In contrast, state-of-the-art commercial or open-source systems only achieve 35.4% and 41.6%. On the extremely hard dataset, where other systems achieve under 13% VIP, CLASI can still achieve 70% VIP. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.21646v2-abstract-full').style.display = 'none'; document.getElementById('2407.21646v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 31 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Authors are listed in alphabetical order by last name. Demonstrations and human-annotated test sets are available at https://byteresearchcla.github.io/clasi</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.14140">arXiv:2407.14140</a> <span> [<a href="https://arxiv.org/pdf/2407.14140">pdf</a>, <a href="https://arxiv.org/format/2407.14140">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> A Secure and Efficient Distributed Semantic Communication System for Heterogeneous Internet of Things Devices </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zeng%2C+W">Weihao Zeng</a>, <a href="/search/eess?searchtype=author&query=Xu%2C+X">Xinyu Xu</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+Q">Qianyun Zhang</a>, <a href="/search/eess?searchtype=author&query=Shi%2C+J">Jiting Shi</a>, <a href="/search/eess?searchtype=author&query=Qin%2C+Z">Zhijin Qin</a>, <a href="/search/eess?searchtype=author&query=Guan%2C+Z">Zhenyu Guan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.14140v1-abstract-short" style="display: inline;"> Semantic communications have emerged as a promising solution to address the challenge of efficient communication in rapidly evolving and increasingly complex Internet of Things (IoT) networks. However, protecting the security of semantic communication systems within the distributed and heterogeneous IoT networks is critical issues that need to be addressed. We develop a secure and efficient distri… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.14140v1-abstract-full').style.display = 'inline'; document.getElementById('2407.14140v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.14140v1-abstract-full" style="display: none;"> Semantic communications have emerged as a promising solution to address the challenge of efficient communication in rapidly evolving and increasingly complex Internet of Things (IoT) networks. However, protecting the security of semantic communication systems within the distributed and heterogeneous IoT networks is critical issues that need to be addressed. We develop a secure and efficient distributed semantic communication system in IoT scenarios, focusing on three aspects: secure system maintenance, efficient system update, and privacy-preserving system usage. Firstly, we propose a blockchain-based interaction framework that ensures the integrity, authentication, and availability of interactions among IoT devices to securely maintain system. This framework includes a novel digital signature verification mechanism designed for semantic communications, enabling secure and efficient interactions with semantic communications. Secondly, to improve the efficiency of interactions, we develop a flexible semantic communication scheme that leverages compressed semantic knowledge bases. This scheme reduces the data exchange required for system update and is adapt to dynamic task requirements and the diversity of device capabilities. Thirdly, we exploit the integration of differential privacy into semantic communications. We analyze the implementation of differential privacy taking into account the lossy nature of semantic communications and wireless channel distortions. An joint model-channel noise mechanism is introduced to achieve differential privacy preservation in semantic communications without compromising the system's functionality. Experiments show that the system is able to achieve integrity, availability, efficiency and the preservation of privacy. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.14140v1-abstract-full').style.display = 'none'; document.getElementById('2407.14140v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.11093">arXiv:2407.11093</a> <span> [<a href="https://arxiv.org/pdf/2407.11093">pdf</a>, <a href="https://arxiv.org/format/2407.11093">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Optics">physics.optics</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1016/j.optlastec.2024.110971.">10.1016/j.optlastec.2024.110971. <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> A neural network for forward and inverse nonlinear Fourier transforms for fiber optic communication </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhang%2C+W+Q">Wen Qi Zhang</a>, <a href="/search/eess?searchtype=author&query=Chan%2C+T+H">Terence H. Chan</a>, <a href="/search/eess?searchtype=author&query=V.%2C+S+A">Shahraam Afshar V.</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.11093v1-abstract-short" style="display: inline;"> We propose a neural network for both forward and inverse continuous nonlinear Fourier transforms, NFT and INFT respectively. We demonstrate the network's capability to perform NFT and INFT for a random mix of NFDM-QAM signals. The network transformations (NFT and INFT) exhibit true characteristics of these transformations; they are significantly different for low and high-power input pulses. The n… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.11093v1-abstract-full').style.display = 'inline'; document.getElementById('2407.11093v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.11093v1-abstract-full" style="display: none;"> We propose a neural network for both forward and inverse continuous nonlinear Fourier transforms, NFT and INFT respectively. We demonstrate the network's capability to perform NFT and INFT for a random mix of NFDM-QAM signals. The network transformations (NFT and INFT) exhibit true characteristics of these transformations; they are significantly different for low and high-power input pulses. The network shows adequate accuracy with an RMSE of 5e-3 for forward and 3e-2 for inverse transforms. We further show that the trained network can be used to perform general nonlinear Fourier transforms on arbitrary pulses beyond the training pulse types. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.11093v1-abstract-full').style.display = 'none'; document.getElementById('2407.11093v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Optics & Laser Technology, vol. 176, p. 110971, Sep. 2024 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.09428">arXiv:2407.09428</a> <span> [<a href="https://arxiv.org/pdf/2407.09428">pdf</a>, <a href="https://arxiv.org/format/2407.09428">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> The Transmission Value of Energy Storage and Fundamental Limitations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhang%2C+Q">Qian Zhang</a>, <a href="/search/eess?searchtype=author&query=Kumar%2C+P+R">P. R. Kumar</a>, <a href="/search/eess?searchtype=author&query=Xie%2C+L">Le Xie</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.09428v1-abstract-short" style="display: inline;"> This study addresses the transmission value of energy storage in electric grids. The inherent connection between storage and transmission infrastructure is captured from a "cumulative energy" perspective, which enables the reformulating of the conventional optimization problem by employing line power flow as the decision variable. The study also establishes the theoretical limitations of both stor… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.09428v1-abstract-full').style.display = 'inline'; document.getElementById('2407.09428v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.09428v1-abstract-full" style="display: none;"> This study addresses the transmission value of energy storage in electric grids. The inherent connection between storage and transmission infrastructure is captured from a "cumulative energy" perspective, which enables the reformulating of the conventional optimization problem by employing line power flow as the decision variable. The study also establishes the theoretical limitations of both storage and transmission lines that can be replaced by each other, providing explicit closed-form expressions for the minimum capacity needed. As a key departure from conventional practice in which transmission lines are designed according to the peak power delivery needs, with sufficient storage capacity, the transmission line capacity can be designed based on the average power delivery needs. The models of this paper only rely on a few basic assumptions, paving the way for understanding future storage as a transmission asset market design. Numerical experiments based on 2-bus, modified RTS 24-bus, RTS-GMLC, and Texas synthetic power systems illustrate the results. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.09428v1-abstract-full').style.display = 'none'; document.getElementById('2407.09428v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.09254">arXiv:2407.09254</a> <span> [<a href="https://arxiv.org/pdf/2407.09254">pdf</a>, <a href="https://arxiv.org/format/2407.09254">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Power Optimization and Deep Learning for Channel Estimation of Active IRS-Aided IoT </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Wang%2C+Y">Yan Wang</a>, <a href="/search/eess?searchtype=author&query=Shu%2C+F">Feng Shu</a>, <a href="/search/eess?searchtype=author&query=Dong%2C+R">Rongen Dong</a>, <a href="/search/eess?searchtype=author&query=Gao%2C+W">Wei Gao</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+Q">Qi Zhang</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+J">Jiajia Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.09254v2-abstract-short" style="display: inline;"> In this paper, channel estimation of an active intelligent reflecting surface (IRS) aided uplink Internet of Things (IoT) network is investigated. Firstly, the least square (LS) estimators for the direct channel and the cascaded channel are presented, respectively. The corresponding mean square errors (MSE) of channel estimators are derived. Subsequently, in order to evaluate the influence of adju… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.09254v2-abstract-full').style.display = 'inline'; document.getElementById('2407.09254v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.09254v2-abstract-full" style="display: none;"> In this paper, channel estimation of an active intelligent reflecting surface (IRS) aided uplink Internet of Things (IoT) network is investigated. Firstly, the least square (LS) estimators for the direct channel and the cascaded channel are presented, respectively. The corresponding mean square errors (MSE) of channel estimators are derived. Subsequently, in order to evaluate the influence of adjusting the transmit power at the IoT devices or the reflected power at the active IRS on Sum-MSE performance, two situations are considered. In the first case, under the total power sum constraint of the IoT devices and active IRS, the closed-form expression of the optimal power allocation factor is derived. In the second case, when the transmit power at the IoT devices is fixed, there exists an optimal reflective power at active IRS. To further improve the estimation performance, the convolutional neural network (CNN)-based direct channel estimation (CDCE) algorithm and the CNN-based cascaded channel estimation (CCCE) algorithm are designed. Finally, simulation results demonstrate the existence of an optimal power allocation strategy that minimizes the Sum-MSE, and further validate the superiority of the proposed CDCE / CCCE algorithms over their respective traditional LS and minimum mean square error (MMSE) baselines. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.09254v2-abstract-full').style.display = 'none'; document.getElementById('2407.09254v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 12 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.07337">arXiv:2407.07337</a> <span> [<a href="https://arxiv.org/pdf/2407.07337">pdf</a>, <a href="https://arxiv.org/format/2407.07337">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> In-Orbit Processing or Not? Sunlight-Aware Task Scheduling for Energy-Efficient Space Edge Computing Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Liu%2C+W">Weisen Liu</a>, <a href="/search/eess?searchtype=author&query=Lai%2C+Z">Zeqi Lai</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+Q">Qian Wu</a>, <a href="/search/eess?searchtype=author&query=Li%2C+H">Hewu Li</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+Q">Qi Zhang</a>, <a href="/search/eess?searchtype=author&query=Li%2C+Z">Zonglun Li</a>, <a href="/search/eess?searchtype=author&query=Li%2C+Y">Yuanjie Li</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+J">Jun Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.07337v1-abstract-short" style="display: inline;"> With the rapid evolution of space-borne capabilities, space edge computing (SEC) is becoming a new computation paradigm for future integrated space and terrestrial networks. Satellite edges adopt advanced on-board hardware, which not only enables new opportunities to perform complex intelligent tasks in orbit, but also involves new challenges due to the additional energy consumption in power-const… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.07337v1-abstract-full').style.display = 'inline'; document.getElementById('2407.07337v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.07337v1-abstract-full" style="display: none;"> With the rapid evolution of space-borne capabilities, space edge computing (SEC) is becoming a new computation paradigm for future integrated space and terrestrial networks. Satellite edges adopt advanced on-board hardware, which not only enables new opportunities to perform complex intelligent tasks in orbit, but also involves new challenges due to the additional energy consumption in power-constrained space environment. In this paper, we present PHOENIX, an energy-efficient task scheduling framework for emerging SEC networks. PHOENIX exploits a key insight that in the SEC network, there always exist a number of sunlit edges which are illuminated during the entire orbital period and have sufficient energy supplement from the sun. PHOENIX accomplishes energy-efficient in-orbit computing by judiciously offloading space tasks to "sunlight-sufficient" edges or to the ground. Specifically, PHOENIX first formulates the SEC battery energy optimizing (SBEO) problem which aims at minimizing the average battery energy consumption while satisfying various task completion constraints. Then PHOENIX incorporates a sunlight-aware scheduling mechanism to solve the SBEO problem and schedule SEC tasks efficiently. Finally, we implement a PHOENIX prototype and build an SEC testbed. Extensive data-driven evaluations demonstrate that as compared to other state-of-the-art solutions, PHOENIX can effectively reduce up to 54.8% SEC battery energy consumption and prolong battery lifetime to 2.9$\times$ while still completing tasks on time. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.07337v1-abstract-full').style.display = 'none'; document.getElementById('2407.07337v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by IEEE INFOCOM 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.06458">arXiv:2407.06458</a> <span> [<a href="https://arxiv.org/pdf/2407.06458">pdf</a>, <a href="https://arxiv.org/format/2407.06458">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1038/s41598-023-44714-2">10.1038/s41598-023-44714-2 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Soli-enabled Noncontact Heart Rate Detection for Sleep and Meditation Tracking </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Xu%2C+L">Luzhou Xu</a>, <a href="/search/eess?searchtype=author&query=Lien%2C+J">Jaime Lien</a>, <a href="/search/eess?searchtype=author&query=Li%2C+H">Haiguang Li</a>, <a href="/search/eess?searchtype=author&query=Gillian%2C+N">Nicholas Gillian</a>, <a href="/search/eess?searchtype=author&query=Nongpiur%2C+R">Rajeev Nongpiur</a>, <a href="/search/eess?searchtype=author&query=Li%2C+J">Jihan Li</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+Q">Qian Zhang</a>, <a href="/search/eess?searchtype=author&query=Cui%2C+J">Jian Cui</a>, <a href="/search/eess?searchtype=author&query=Jorgensen%2C+D">David Jorgensen</a>, <a href="/search/eess?searchtype=author&query=Bernstein%2C+A">Adam Bernstein</a>, <a href="/search/eess?searchtype=author&query=Bedal%2C+L">Lauren Bedal</a>, <a href="/search/eess?searchtype=author&query=Hayashi%2C+E">Eiji Hayashi</a>, <a href="/search/eess?searchtype=author&query=Yamanaka%2C+J">Jin Yamanaka</a>, <a href="/search/eess?searchtype=author&query=Lee%2C+A">Alex Lee</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+J">Jian Wang</a>, <a href="/search/eess?searchtype=author&query=Shin%2C+D">D Shin</a>, <a href="/search/eess?searchtype=author&query=Poupyrev%2C+I">Ivan Poupyrev</a>, <a href="/search/eess?searchtype=author&query=Thormundsson%2C+T">Trausti Thormundsson</a>, <a href="/search/eess?searchtype=author&query=Pathak%2C+A">Anupam Pathak</a>, <a href="/search/eess?searchtype=author&query=Patel%2C+S">Shwetak Patel</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.06458v1-abstract-short" style="display: inline;"> Heart rate (HR) is a crucial physiological signal that can be used to monitor health and fitness. Traditional methods for measuring HR require wearable devices, which can be inconvenient or uncomfortable, especially during sleep and meditation. Noncontact HR detection methods employing microwave radar can be a promising alternative. However, the existing approaches in the literature usually use hi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.06458v1-abstract-full').style.display = 'inline'; document.getElementById('2407.06458v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.06458v1-abstract-full" style="display: none;"> Heart rate (HR) is a crucial physiological signal that can be used to monitor health and fitness. Traditional methods for measuring HR require wearable devices, which can be inconvenient or uncomfortable, especially during sleep and meditation. Noncontact HR detection methods employing microwave radar can be a promising alternative. However, the existing approaches in the literature usually use high-gain antennas and require the sensor to face the user's chest or back, making them difficult to integrate into a portable device and unsuitable for sleep and meditation tracking applications. This study presents a novel approach for noncontact HR detection using a miniaturized Soli radar chip embedded in a portable device (Google Nest Hub). The chip has a $6.5 \mbox{ mm} \times 5 \mbox{ mm} \times 0.9 \mbox{ mm}$ dimension and can be easily integrated into various devices. The proposed approach utilizes advanced signal processing and machine learning techniques to extract HRs from radar signals. The approach is validated on a sleep dataset (62 users, 498 hours) and a meditation dataset (114 users, 1131 minutes). The approach achieves a mean absolute error (MAE) of $1.69$ bpm and a mean absolute percentage error (MAPE) of $2.67\%$ on the sleep dataset. On the meditation dataset, the approach achieves an MAE of $1.05$ bpm and a MAPE of $1.56\%$. The recall rates for the two datasets are $88.53\%$ and $98.16\%$, respectively. This study represents the first application of the noncontact HR detection technology to sleep and meditation tracking, offering a promising alternative to wearable devices for HR monitoring during sleep and meditation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.06458v1-abstract-full').style.display = 'none'; document.getElementById('2407.06458v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">15 pages</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Sci Rep 13, 18008 (2023) </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.04051">arXiv:2407.04051</a> <span> [<a href="https://arxiv.org/pdf/2407.04051">pdf</a>, <a href="https://arxiv.org/format/2407.04051">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> FunAudioLLM: Voice Understanding and Generation Foundation Models for Natural Interaction Between Humans and LLMs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=An%2C+K">Keyu An</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+Q">Qian Chen</a>, <a href="/search/eess?searchtype=author&query=Deng%2C+C">Chong Deng</a>, <a href="/search/eess?searchtype=author&query=Du%2C+Z">Zhihao Du</a>, <a href="/search/eess?searchtype=author&query=Gao%2C+C">Changfeng Gao</a>, <a href="/search/eess?searchtype=author&query=Gao%2C+Z">Zhifu Gao</a>, <a href="/search/eess?searchtype=author&query=Gu%2C+Y">Yue Gu</a>, <a href="/search/eess?searchtype=author&query=He%2C+T">Ting He</a>, <a href="/search/eess?searchtype=author&query=Hu%2C+H">Hangrui Hu</a>, <a href="/search/eess?searchtype=author&query=Hu%2C+K">Kai Hu</a>, <a href="/search/eess?searchtype=author&query=Ji%2C+S">Shengpeng Ji</a>, <a href="/search/eess?searchtype=author&query=Li%2C+Y">Yabin Li</a>, <a href="/search/eess?searchtype=author&query=Li%2C+Z">Zerui Li</a>, <a href="/search/eess?searchtype=author&query=Lu%2C+H">Heng Lu</a>, <a href="/search/eess?searchtype=author&query=Luo%2C+H">Haoneng Luo</a>, <a href="/search/eess?searchtype=author&query=Lv%2C+X">Xiang Lv</a>, <a href="/search/eess?searchtype=author&query=Ma%2C+B">Bin Ma</a>, <a href="/search/eess?searchtype=author&query=Ma%2C+Z">Ziyang Ma</a>, <a href="/search/eess?searchtype=author&query=Ni%2C+C">Chongjia Ni</a>, <a href="/search/eess?searchtype=author&query=Song%2C+C">Changhe Song</a>, <a href="/search/eess?searchtype=author&query=Shi%2C+J">Jiaqi Shi</a>, <a href="/search/eess?searchtype=author&query=Shi%2C+X">Xian Shi</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+H">Hao Wang</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+W">Wen Wang</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+Y">Yuxuan Wang</a> , et al. (8 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.04051v3-abstract-short" style="display: inline;"> This report introduces FunAudioLLM, a model family designed to enhance natural voice interactions between humans and large language models (LLMs). At its core are two innovative models: SenseVoice, which handles multilingual speech recognition, emotion recognition, and audio event detection; and CosyVoice, which facilitates natural speech generation with control over multiple languages, timbre, sp… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.04051v3-abstract-full').style.display = 'inline'; document.getElementById('2407.04051v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.04051v3-abstract-full" style="display: none;"> This report introduces FunAudioLLM, a model family designed to enhance natural voice interactions between humans and large language models (LLMs). At its core are two innovative models: SenseVoice, which handles multilingual speech recognition, emotion recognition, and audio event detection; and CosyVoice, which facilitates natural speech generation with control over multiple languages, timbre, speaking style, and speaker identity. SenseVoice-Small delivers exceptionally low-latency ASR for 5 languages, and SenseVoice-Large supports high-precision ASR for over 50 languages, while CosyVoice excels in multi-lingual voice generation, zero-shot in-context learning, cross-lingual voice cloning, and instruction-following capabilities. The models related to SenseVoice and CosyVoice have been open-sourced on Modelscope and Huggingface, along with the corresponding training, inference, and fine-tuning codes released on GitHub. By integrating these models with LLMs, FunAudioLLM enables applications such as speech-to-speech translation, emotional voice chat, interactive podcasts, and expressive audiobook narration, thereby pushing the boundaries of voice interaction technology. Demos are available at https://fun-audio-llm.github.io, and the code can be accessed at https://github.com/FunAudioLLM. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.04051v3-abstract-full').style.display = 'none'; document.getElementById('2407.04051v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Work in progress. Authors are listed in alphabetical order by family name</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.00042">arXiv:2407.00042</a> <span> [<a href="https://arxiv.org/pdf/2407.00042">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Neurons and Cognition">q-bio.NC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Social and Information Networks">cs.SI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Module control of network analysis in psychopathology </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Pan%2C+C">Chunyu Pan</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+Q">Quan Zhang</a>, <a href="/search/eess?searchtype=author&query=Zhu%2C+Y">Yue Zhu</a>, <a href="/search/eess?searchtype=author&query=Kong%2C+S">Shengzhou Kong</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+J">Juan Liu</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+C">Changsheng Zhang</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+F">Fei Wang</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+X">Xizhe Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.00042v1-abstract-short" style="display: inline;"> The network approach to characterizing psychopathology departs from traditional latent categorical and dimensional approaches. Causal interplay among symptoms contributed to dynamic psychopathology system. Therefore, analyzing the symptom clusters is critical for understanding mental disorders. Furthermore, despite extensive research studying the topological features of symptom networks, the contr… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.00042v1-abstract-full').style.display = 'inline'; document.getElementById('2407.00042v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.00042v1-abstract-full" style="display: none;"> The network approach to characterizing psychopathology departs from traditional latent categorical and dimensional approaches. Causal interplay among symptoms contributed to dynamic psychopathology system. Therefore, analyzing the symptom clusters is critical for understanding mental disorders. Furthermore, despite extensive research studying the topological features of symptom networks, the control relationships between symptoms remain largely unclear. Here, we present a novel systematizing concept, module control, to analyze the control principle of the symptom network at a module level. We introduce Module Control Network (MCN) to identify key modules that regulate the network's behavior. By applying our approach to a multivariate psychological dataset, we discover that non-emotional modules, such as sleep-related and stress-related modules, are the primary controlling modules in the symptom network. Our findings indicate that module control can expose central symptom cluster governing psychopathology network, offering novel insights into the underlying mechanisms of mental disorders and individualized approach to psychological interventions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.00042v1-abstract-full').style.display = 'none'; document.getElementById('2407.00042v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.18054">arXiv:2406.18054</a> <span> [<a href="https://arxiv.org/pdf/2406.18054">pdf</a>, <a href="https://arxiv.org/format/2406.18054">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Leveraging Pre-trained Models for FF-to-FFPE Histopathological Image Translation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhang%2C+Q">Qilai Zhang</a>, <a href="/search/eess?searchtype=author&query=Li%2C+J">Jiawen Li</a>, <a href="/search/eess?searchtype=author&query=Liao%2C+P">Peiran Liao</a>, <a href="/search/eess?searchtype=author&query=Hu%2C+J">Jiali Hu</a>, <a href="/search/eess?searchtype=author&query=Guan%2C+T">Tian Guan</a>, <a href="/search/eess?searchtype=author&query=Han%2C+A">Anjia Han</a>, <a href="/search/eess?searchtype=author&query=He%2C+Y">Yonghong He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.18054v3-abstract-short" style="display: inline;"> The two primary types of Hematoxylin and Eosin (H&E) slides in histopathology are Formalin-Fixed Paraffin-Embedded (FFPE) and Fresh Frozen (FF). FFPE slides offer high quality histopathological images but require a labor-intensive acquisition process. In contrast, FF slides can be prepared quickly, but the image quality is relatively poor. Our task is to translate FF images into FFPE style, thereb… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.18054v3-abstract-full').style.display = 'inline'; document.getElementById('2406.18054v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.18054v3-abstract-full" style="display: none;"> The two primary types of Hematoxylin and Eosin (H&E) slides in histopathology are Formalin-Fixed Paraffin-Embedded (FFPE) and Fresh Frozen (FF). FFPE slides offer high quality histopathological images but require a labor-intensive acquisition process. In contrast, FF slides can be prepared quickly, but the image quality is relatively poor. Our task is to translate FF images into FFPE style, thereby improving the image quality for diagnostic purposes. In this paper, we propose Diffusion-FFPE, a method for FF-to-FFPE histopathological image translation using a pre-trained diffusion model. Specifically, we utilize a one-step diffusion model as the generator, which we fine-tune using LoRA adapters within an adversarial learning framework. To enable the model to effectively capture both global structural patterns and local details, we introduce a multi-scale feature fusion module that leverages two VAE encoders to extract features at different image resolutions, performing feature fusion before inputting them into the UNet. Additionally, a pre-trained vision-language model for histopathology serves as the backbone for the discriminator, enhancing model performance. Our FF-to-FFPE translation experiments on the TCGA-NSCLC dataset demonstrate that the proposed approach outperforms existing methods. The code and models are released at https://github.com/QilaiZhang/Diffusion-FFPE. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.18054v3-abstract-full').style.display = 'none'; document.getElementById('2406.18054v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 26 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at IEEE BIBM 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.17976">arXiv:2406.17976</a> <span> [<a href="https://arxiv.org/pdf/2406.17976">pdf</a>, <a href="https://arxiv.org/format/2406.17976">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1038/s41558-024-02092-1">10.1038/s41558-024-02092-1 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> The Role of Electric Grid Research in Addressing Climate Change </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Xie%2C+L">Le Xie</a>, <a href="/search/eess?searchtype=author&query=Majumder%2C+S">Subir Majumder</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+T">Tong Huang</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+Q">Qian Zhang</a>, <a href="/search/eess?searchtype=author&query=Chang%2C+P">Ping Chang</a>, <a href="/search/eess?searchtype=author&query=Hill%2C+D+J">David J. Hill</a>, <a href="/search/eess?searchtype=author&query=Shahidehpour%2C+M">Mohammad Shahidehpour</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.17976v2-abstract-short" style="display: inline;"> Addressing the urgency of climate change necessitates a coordinated and inclusive effort from all relevant stakeholders. Critical to this effort is the modeling, analysis, control, and integration of technological innovations within the electric energy system, which plays a crucial role in scaling up climate change solutions. This perspective article presents a set of research challenges and oppor… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.17976v2-abstract-full').style.display = 'inline'; document.getElementById('2406.17976v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.17976v2-abstract-full" style="display: none;"> Addressing the urgency of climate change necessitates a coordinated and inclusive effort from all relevant stakeholders. Critical to this effort is the modeling, analysis, control, and integration of technological innovations within the electric energy system, which plays a crucial role in scaling up climate change solutions. This perspective article presents a set of research challenges and opportunities in the area of electric power systems that would be crucial in accelerating Gigaton-level decarbonization. Furthermore, it highlights institutional challenges associated with developing market mechanisms and regulatory architectures, ensuring that incentives are aligned for stakeholders to effectively implement the technological solutions on a large scale. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.17976v2-abstract-full').style.display = 'none'; document.getElementById('2406.17976v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 25 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">17 pages, 2 figures</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Nat. Clim. Chang. (2024) </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.12236">arXiv:2406.12236</a> <span> [<a href="https://arxiv.org/pdf/2406.12236">pdf</a>, <a href="https://arxiv.org/format/2406.12236">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Binaural Selective Attention Model for Target Speaker Extraction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Meng%2C+H">Hanyu Meng</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+Q">Qiquan Zhang</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+X">Xiangyu Zhang</a>, <a href="/search/eess?searchtype=author&query=Sethu%2C+V">Vidhyasaharan Sethu</a>, <a href="/search/eess?searchtype=author&query=Ambikairajah%2C+E">Eliathamby Ambikairajah</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.12236v1-abstract-short" style="display: inline;"> The remarkable ability of humans to selectively focus on a target speaker in cocktail party scenarios is facilitated by binaural audio processing. In this paper, we present a binaural time-domain Target Speaker Extraction model based on the Filter-and-Sum Network (FaSNet). Inspired by human selective hearing, our proposed model introduces target speaker embedding into separators using a multi-head… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.12236v1-abstract-full').style.display = 'inline'; document.getElementById('2406.12236v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.12236v1-abstract-full" style="display: none;"> The remarkable ability of humans to selectively focus on a target speaker in cocktail party scenarios is facilitated by binaural audio processing. In this paper, we present a binaural time-domain Target Speaker Extraction model based on the Filter-and-Sum Network (FaSNet). Inspired by human selective hearing, our proposed model introduces target speaker embedding into separators using a multi-head attention-based selective attention block. We also compared two binaural interaction approaches -- the cosine similarity of time-domain signals and inter-channel correlation in learned spectral representations. Our experimental results show that our proposed model outperforms monaural configurations and state-of-the-art multi-channel target speaker extraction models, achieving best-in-class performance with 18.52 dB SI-SDR, 19.12 dB SDR, and 3.05 PESQ scores under anechoic two-speaker test configurations. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.12236v1-abstract-full').style.display = 'none'; document.getElementById('2406.12236v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by INTERSPEECH2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.11401">arXiv:2406.11401</a> <span> [<a href="https://arxiv.org/pdf/2406.11401">pdf</a>, <a href="https://arxiv.org/format/2406.11401">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> An Exploration of Length Generalization in Transformer-Based Speech Enhancement </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhang%2C+Q">Qiquan Zhang</a>, <a href="/search/eess?searchtype=author&query=Zhu%2C+H">Hongxu Zhu</a>, <a href="/search/eess?searchtype=author&query=Qian%2C+X">Xinyuan Qian</a>, <a href="/search/eess?searchtype=author&query=Ambikairajah%2C+E">Eliathamby Ambikairajah</a>, <a href="/search/eess?searchtype=author&query=Li%2C+H">Haizhou Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.11401v1-abstract-short" style="display: inline;"> The use of Transformer architectures has facilitated remarkable progress in speech enhancement. Training Transformers using substantially long speech utterances is often infeasible as self-attention suffers from quadratic complexity. It is a critical and unexplored challenge for a Transformer-based speech enhancement model to learn from short speech utterances and generalize to longer ones. In thi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.11401v1-abstract-full').style.display = 'inline'; document.getElementById('2406.11401v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.11401v1-abstract-full" style="display: none;"> The use of Transformer architectures has facilitated remarkable progress in speech enhancement. Training Transformers using substantially long speech utterances is often infeasible as self-attention suffers from quadratic complexity. It is a critical and unexplored challenge for a Transformer-based speech enhancement model to learn from short speech utterances and generalize to longer ones. In this paper, we conduct comprehensive experiments to explore the length generalization problem in speech enhancement with Transformer. Our findings first establish that position embedding provides an effective instrument to alleviate the impact of utterance length on Transformer-based speech enhancement. Specifically, we explore four different position embedding schemes to enable length generalization. The results confirm the superiority of relative position embeddings (RPEs) over absolute PE (APEs) in length generalization. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.11401v1-abstract-full').style.display = 'none'; document.getElementById('2406.11401v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by INTERSPEECH 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.09317">arXiv:2406.09317</a> <span> [<a href="https://arxiv.org/pdf/2406.09317">pdf</a>, <a href="https://arxiv.org/format/2406.09317">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Common and Rare Fundus Diseases Identification Using Vision-Language Foundation Model with Knowledge of Over 400 Diseases </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Wang%2C+M">Meng Wang</a>, <a href="/search/eess?searchtype=author&query=Lin%2C+T">Tian Lin</a>, <a href="/search/eess?searchtype=author&query=Lin%2C+A">Aidi Lin</a>, <a href="/search/eess?searchtype=author&query=Yu%2C+K">Kai Yu</a>, <a href="/search/eess?searchtype=author&query=Peng%2C+Y">Yuanyuan Peng</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+L">Lianyu Wang</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+C">Cheng Chen</a>, <a href="/search/eess?searchtype=author&query=Zou%2C+K">Ke Zou</a>, <a href="/search/eess?searchtype=author&query=Liang%2C+H">Huiyu Liang</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+M">Man Chen</a>, <a href="/search/eess?searchtype=author&query=Yao%2C+X">Xue Yao</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+M">Meiqin Zhang</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+B">Binwei Huang</a>, <a href="/search/eess?searchtype=author&query=Zheng%2C+C">Chaoxin Zheng</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+P">Peixin Zhang</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+W">Wei Chen</a>, <a href="/search/eess?searchtype=author&query=Luo%2C+Y">Yilong Luo</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+Y">Yifan Chen</a>, <a href="/search/eess?searchtype=author&query=Xia%2C+H">Honghe Xia</a>, <a href="/search/eess?searchtype=author&query=Shi%2C+T">Tingkun Shi</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+Q">Qi Zhang</a>, <a href="/search/eess?searchtype=author&query=Guo%2C+J">Jinming Guo</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+X">Xiaolin Chen</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+J">Jingcheng Wang</a>, <a href="/search/eess?searchtype=author&query=Tham%2C+Y+C">Yih Chung Tham</a> , et al. (24 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.09317v2-abstract-short" style="display: inline;"> Previous foundation models for retinal images were pre-trained with limited disease categories and knowledge base. Here we introduce RetiZero, a vision-language foundation model that leverages knowledge from over 400 fundus diseases. To RetiZero's pre-training, we compiled 341,896 fundus images paired with text descriptions, sourced from public datasets, ophthalmic literature, and online resources… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.09317v2-abstract-full').style.display = 'inline'; document.getElementById('2406.09317v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.09317v2-abstract-full" style="display: none;"> Previous foundation models for retinal images were pre-trained with limited disease categories and knowledge base. Here we introduce RetiZero, a vision-language foundation model that leverages knowledge from over 400 fundus diseases. To RetiZero's pre-training, we compiled 341,896 fundus images paired with text descriptions, sourced from public datasets, ophthalmic literature, and online resources, encompassing a diverse range of diseases across multiple ethnicities and countries. RetiZero exhibits superior performance in several downstream tasks, including zero-shot disease recognition, image-to-image retrieval, and internal- and cross-domain disease identification. In zero-shot scenarios, RetiZero achieves Top5 accuracy scores of 0.8430 for 15 fundus diseases and 0.7561 for 52 fundus diseases. For image retrieval, it achieves Top5 scores of 0.9500 and 0.8860 for the same disease sets, respectively. Clinical evaluations show that RetiZero's Top3 zero-shot performance surpasses the average of 19 ophthalmologists from Singapore, China and the United States. Furthermore, RetiZero significantly enhances clinicians' accuracy in diagnosing fundus disease. These findings underscore the value of integrating the RetiZero foundation model into clinical settings, where a variety of fundus diseases are encountered. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.09317v2-abstract-full').style.display = 'none'; document.getElementById('2406.09317v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 13 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.08523">arXiv:2406.08523</a> <span> [<a href="https://arxiv.org/pdf/2406.08523">pdf</a>, <a href="https://arxiv.org/format/2406.08523">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> A Plug-and-Play Untrained Neural Network for Full Waveform Inversion in Reconstructing Sound Speed Images of Ultrasound Computed Tomography </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Yan%2C+W">Weicheng Yan</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+Q">Qiude Zhang</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+Y">Yun Wu</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+Z">Zhaohui Liu</a>, <a href="/search/eess?searchtype=author&query=Zhou%2C+L">Liang Zhou</a>, <a href="/search/eess?searchtype=author&query=Ding%2C+M">Mingyue Ding</a>, <a href="/search/eess?searchtype=author&query=Yuchi%2C+M">Ming Yuchi</a>, <a href="/search/eess?searchtype=author&query=Qiu%2C+W">Wu Qiu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.08523v2-abstract-short" style="display: inline;"> Ultrasound computed tomography (USCT), as an emerging technology, can provide multiple quantitative parametric images of human tissue, such as sound speed and attenuation images, distinguishing it from conventional B-mode (reflection) ultrasound imaging. Full waveform inversion (FWI) is acknowledged as a technique with the greatest potential for reconstructing high-resolution sound speed images in… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.08523v2-abstract-full').style.display = 'inline'; document.getElementById('2406.08523v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.08523v2-abstract-full" style="display: none;"> Ultrasound computed tomography (USCT), as an emerging technology, can provide multiple quantitative parametric images of human tissue, such as sound speed and attenuation images, distinguishing it from conventional B-mode (reflection) ultrasound imaging. Full waveform inversion (FWI) is acknowledged as a technique with the greatest potential for reconstructing high-resolution sound speed images in USCT. However, traditional FWI for sound speed image reconstruction suffers from high sensitivity to the initial model caused by its strong non-convex nonlinearity, resulting in poor performance when ultrasound signals are at high frequencies. This limitation significantly restricts the application of FWI in the USCT imaging field. In this paper, we propose an untrained neural network (UNN) that can be integrated into the traditional iteration-based FWI framework as an implicit regularization prior. This integration allows for seamless deployment as a plug-and-play module within existing FWI algorithms or their variants. Notably, the proposed UNN method can be trained in an unsupervised fashion, a vital aspect in medical imaging where ground truth data is often unavailable. Evaluations of the numerical simulation and phantom experiment of the breast demonstrate that the proposed UNN improves the robustness of image reconstruction, reduces image artifacts, and achieves great image contrast. To the best of our knowledge, this study represents the first attempt to propose an implicit UNN for FWI in reconstructing sound speed images for USCT. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.08523v2-abstract-full').style.display = 'none'; document.getElementById('2406.08523v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 12 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.02430">arXiv:2406.02430</a> <span> [<a href="https://arxiv.org/pdf/2406.02430">pdf</a>, <a href="https://arxiv.org/format/2406.02430">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Seed-TTS: A Family of High-Quality Versatile Speech Generation Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Anastassiou%2C+P">Philip Anastassiou</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+J">Jiawei Chen</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+J">Jitong Chen</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+Y">Yuanzhe Chen</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+Z">Zhuo Chen</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+Z">Ziyi Chen</a>, <a href="/search/eess?searchtype=author&query=Cong%2C+J">Jian Cong</a>, <a href="/search/eess?searchtype=author&query=Deng%2C+L">Lelai Deng</a>, <a href="/search/eess?searchtype=author&query=Ding%2C+C">Chuang Ding</a>, <a href="/search/eess?searchtype=author&query=Gao%2C+L">Lu Gao</a>, <a href="/search/eess?searchtype=author&query=Gong%2C+M">Mingqing Gong</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+P">Peisong Huang</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+Q">Qingqing Huang</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+Z">Zhiying Huang</a>, <a href="/search/eess?searchtype=author&query=Huo%2C+Y">Yuanyuan Huo</a>, <a href="/search/eess?searchtype=author&query=Jia%2C+D">Dongya Jia</a>, <a href="/search/eess?searchtype=author&query=Li%2C+C">Chumin Li</a>, <a href="/search/eess?searchtype=author&query=Li%2C+F">Feiya Li</a>, <a href="/search/eess?searchtype=author&query=Li%2C+H">Hui Li</a>, <a href="/search/eess?searchtype=author&query=Li%2C+J">Jiaxin Li</a>, <a href="/search/eess?searchtype=author&query=Li%2C+X">Xiaoyang Li</a>, <a href="/search/eess?searchtype=author&query=Li%2C+X">Xingxing Li</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+L">Lin Liu</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+S">Shouda Liu</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+S">Sichao Liu</a> , et al. (21 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.02430v1-abstract-short" style="display: inline;"> We introduce Seed-TTS, a family of large-scale autoregressive text-to-speech (TTS) models capable of generating speech that is virtually indistinguishable from human speech. Seed-TTS serves as a foundation model for speech generation and excels in speech in-context learning, achieving performance in speaker similarity and naturalness that matches ground truth human speech in both objective and sub… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.02430v1-abstract-full').style.display = 'inline'; document.getElementById('2406.02430v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.02430v1-abstract-full" style="display: none;"> We introduce Seed-TTS, a family of large-scale autoregressive text-to-speech (TTS) models capable of generating speech that is virtually indistinguishable from human speech. Seed-TTS serves as a foundation model for speech generation and excels in speech in-context learning, achieving performance in speaker similarity and naturalness that matches ground truth human speech in both objective and subjective evaluations. With fine-tuning, we achieve even higher subjective scores across these metrics. Seed-TTS offers superior controllability over various speech attributes such as emotion and is capable of generating highly expressive and diverse speech for speakers in the wild. Furthermore, we propose a self-distillation method for speech factorization, as well as a reinforcement learning approach to enhance model robustness, speaker similarity, and controllability. We additionally present a non-autoregressive (NAR) variant of the Seed-TTS model, named $\text{Seed-TTS}_\text{DiT}$, which utilizes a fully diffusion-based architecture. Unlike previous NAR-based TTS systems, $\text{Seed-TTS}_\text{DiT}$ does not depend on pre-estimated phoneme durations and performs speech generation through end-to-end processing. We demonstrate that this variant achieves comparable performance to the language model-based variant and showcase its effectiveness in speech editing. We encourage readers to listen to demos at \url{https://bytedancespeech.github.io/seedtts_tech_report}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.02430v1-abstract-full').style.display = 'none'; document.getElementById('2406.02430v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Zhang%2C+Q&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Zhang%2C+Q&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhang%2C+Q&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhang%2C+Q&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhang%2C+Q&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhang%2C+Q&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhang%2C+Q&start=250" class="pagination-link " aria-label="Page 6" aria-current="page">6 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhang%2C+Q&start=300" class="pagination-link " aria-label="Page 7" aria-current="page">7 </a> </li> </ul> </nav> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>