CINXE.COM
Search | arXiv e-print repository
<!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"/> <meta name="viewport" content="width=device-width, initial-scale=1"/> <!-- new favicon config and versions by realfavicongenerator.net --> <link rel="apple-touch-icon" sizes="180x180" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-16x16.png"> <link rel="manifest" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/site.webmanifest"> <link rel="mask-icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/safari-pinned-tab.svg" color="#b31b1b"> <link rel="shortcut icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon.ico"> <meta name="msapplication-TileColor" content="#b31b1b"> <meta name="msapplication-config" content="images/icons/browserconfig.xml"> <meta name="theme-color" content="#b31b1b"> <!-- end favicon config --> <title>Search | arXiv e-print repository</title> <script defer src="https://static.arxiv.org/static/base/1.0.0a5/fontawesome-free-5.11.2-web/js/all.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/base/1.0.0a5/css/arxivstyle.css" /> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ messageStyle: "none", extensions: ["tex2jax.js"], jax: ["input/TeX", "output/HTML-CSS"], tex2jax: { inlineMath: [ ['$','$'], ["\\(","\\)"] ], displayMath: [ ['$$','$$'], ["\\[","\\]"] ], processEscapes: true, ignoreClass: '.*', processClass: 'mathjax.*' }, TeX: { extensions: ["AMSmath.js", "AMSsymbols.js", "noErrors.js"], noErrors: { inlineDelimiters: ["$","$"], multiLine: false, style: { "font-size": "normal", "border": "" } } }, "HTML-CSS": { availableFonts: ["TeX"] } }); </script> <script src='//static.arxiv.org/MathJax-2.7.3/MathJax.js'></script> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/notification.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/bulma-tooltip.min.css" /> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/search.css" /> <script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g=" crossorigin="anonymous"></script> <script src="https://static.arxiv.org/static/search/0.5.6/js/fieldset.js"></script> <style> radio#cf-customfield_11400 { display: none; } </style> </head> <body> <header><a href="#main-container" class="is-sr-only">Skip to main content</a> <!-- contains Cornell logo and sponsor statement --> <div class="attribution level is-marginless" role="banner"> <div class="level-left"> <a class="level-item" href="https://cornell.edu/"><img src="https://static.arxiv.org/static/base/1.0.0a5/images/cornell-reduced-white-SMALL.svg" alt="Cornell University" width="200" aria-label="logo" /></a> </div> <div class="level-right is-marginless"><p class="sponsors level-item is-marginless"><span id="support-ack-url">We gratefully acknowledge support from<br /> the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors. <a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 85 results for author: <span class="mathjax">Zhu, G</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> <div class="content"> <form method="GET" action="/search/eess" aria-role="search"> Searching in archive <strong>eess</strong>. <a href="/search/?searchtype=author&query=Zhu%2C+G">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." type="text" value="Zhu, G"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Zhu%2C+G&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Zhu, G"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. </div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Zhu%2C+G&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Zhu%2C+G&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhu%2C+G&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.06861">arXiv:2412.06861</a> <span> [<a href="https://arxiv.org/pdf/2412.06861">pdf</a>, <a href="https://arxiv.org/format/2412.06861">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Mining Limited Data Sufficiently: A BERT-inspired Approach for CSI Time Series Application in Wireless Communication and Sensing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhao%2C+Z">Zijian Zhao</a>, <a href="/search/eess?searchtype=author&query=Meng%2C+F">Fanyi Meng</a>, <a href="/search/eess?searchtype=author&query=Li%2C+H">Hang Li</a>, <a href="/search/eess?searchtype=author&query=Li%2C+X">Xiaoyang Li</a>, <a href="/search/eess?searchtype=author&query=Zhu%2C+G">Guangxu Zhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.06861v1-abstract-short" style="display: inline;"> Channel State Information (CSI) is the cornerstone in both wireless communication and sensing systems. In wireless communication systems, CSI provides essential insights into channel conditions, enabling system optimizations like channel compensation and dynamic resource allocation. However, the high computational complexity of CSI estimation algorithms necessitates the development of fast deep le… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.06861v1-abstract-full').style.display = 'inline'; document.getElementById('2412.06861v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.06861v1-abstract-full" style="display: none;"> Channel State Information (CSI) is the cornerstone in both wireless communication and sensing systems. In wireless communication systems, CSI provides essential insights into channel conditions, enabling system optimizations like channel compensation and dynamic resource allocation. However, the high computational complexity of CSI estimation algorithms necessitates the development of fast deep learning methods for CSI prediction. In wireless sensing systems, CSI can be leveraged to infer environmental changes, facilitating various functions, including gesture recognition and people identification. Deep learning methods have demonstrated significant advantages over model-based approaches in these fine-grained CSI classification tasks, particularly when classes vary across different scenarios. However, a major challenge in training deep learning networks for wireless systems is the limited availability of data, further complicated by the diverse formats of many public datasets, which hinder integration. Additionally, collecting CSI data can be resource-intensive, requiring considerable time and manpower. To address these challenges, we propose CSI-BERT2 for CSI prediction and classification tasks, effectively utilizing limited data through a pre-training and fine-tuning approach. Building on CSI-BERT1, we enhance the model architecture by introducing an Adaptive Re-Weighting Layer (ARL) and a Multi-Layer Perceptron (MLP) to better capture sub-carrier and timestamp information, effectively addressing the permutation-invariance problem. Furthermore, we propose a Mask Prediction Model (MPM) fine-tuning method to improve the model's adaptability for CSI prediction tasks. Experimental results demonstrate that CSI-BERT2 achieves state-of-the-art performance across all tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.06861v1-abstract-full').style.display = 'none'; document.getElementById('2412.06861v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.05074">arXiv:2412.05074</a> <span> [<a href="https://arxiv.org/pdf/2412.05074">pdf</a>, <a href="https://arxiv.org/format/2412.05074">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> LoFi: Vision-Aided Label Generator for Wi-Fi Localization and Tracking </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhao%2C+Z">Zijian Zhao</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+T">Tingwei Chen</a>, <a href="/search/eess?searchtype=author&query=Meng%2C+F">Fanyi Meng</a>, <a href="/search/eess?searchtype=author&query=Cai%2C+Z">Zhijie Cai</a>, <a href="/search/eess?searchtype=author&query=Li%2C+H">Hang Li</a>, <a href="/search/eess?searchtype=author&query=Li%2C+X">Xiaoyang Li</a>, <a href="/search/eess?searchtype=author&query=Zhu%2C+G">Guangxu Zhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.05074v1-abstract-short" style="display: inline;"> Wi-Fi localization and tracking has shown immense potential due to its privacy-friendliness, wide coverage, permeability, independence from lighting conditions, and low cost. Current methods can be broadly categorized as model-based and data-driven approaches, where data-driven methods show better performance and have less requirement for specialized devices, but struggle with limited datasets for… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.05074v1-abstract-full').style.display = 'inline'; document.getElementById('2412.05074v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.05074v1-abstract-full" style="display: none;"> Wi-Fi localization and tracking has shown immense potential due to its privacy-friendliness, wide coverage, permeability, independence from lighting conditions, and low cost. Current methods can be broadly categorized as model-based and data-driven approaches, where data-driven methods show better performance and have less requirement for specialized devices, but struggle with limited datasets for training. Due to limitations in current data collection methods, most datasets only provide coarse-grained ground truth (GT) or limited amount of label points, which greatly hinders the development of data-driven methods. Even though lidar can provide accurate GT, their high cost makes them inaccessible to many users. To address these challenges, we propose LoFi, a vision-aided label generator for Wi-Fi localization and tracking, which can generate ground truth position coordinates solely based on 2D images. The easy and quick data collection method also helps data-driven based methods deploy in practice, since Wi-Fi is a low-generalization modality and when using relevant methods, it always requires fine-tuning the model using newly collected data. Based on our method, we also collect a Wi-Fi tracking and localization dataset using ESP32-S3 and a webcam. To facilitate future research, we will make our code and dataset publicly available upon publication. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.05074v1-abstract-full').style.display = 'none'; document.getElementById('2412.05074v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.04783">arXiv:2412.04783</a> <span> [<a href="https://arxiv.org/pdf/2412.04783">pdf</a>, <a href="https://arxiv.org/format/2412.04783">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> KNN-MMD: Cross Domain Wi-Fi Sensing Based on Local Distribution Alignment </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhao%2C+Z">Zijian Zhao</a>, <a href="/search/eess?searchtype=author&query=Cai%2C+Z">Zhijie Cai</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+T">Tingwei Chen</a>, <a href="/search/eess?searchtype=author&query=Li%2C+X">Xiaoyang Li</a>, <a href="/search/eess?searchtype=author&query=Li%2C+H">Hang Li</a>, <a href="/search/eess?searchtype=author&query=Zhu%2C+G">Guangxu Zhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.04783v1-abstract-short" style="display: inline;"> As a key technology in Integrated Sensing and Communications (ISAC), Wi-Fi sensing has gained widespread application in various settings such as homes, offices, and public spaces. By analyzing the patterns of Channel State Information (CSI), we can obtain information about people's actions for tasks like person identification, gesture recognition, and fall detection. However, the CSI is heavily in… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.04783v1-abstract-full').style.display = 'inline'; document.getElementById('2412.04783v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.04783v1-abstract-full" style="display: none;"> As a key technology in Integrated Sensing and Communications (ISAC), Wi-Fi sensing has gained widespread application in various settings such as homes, offices, and public spaces. By analyzing the patterns of Channel State Information (CSI), we can obtain information about people's actions for tasks like person identification, gesture recognition, and fall detection. However, the CSI is heavily influenced by the environment, such that even minor environmental changes can significantly alter the CSI patterns. This will cause the performance deterioration and even failure when applying the Wi-Fi sensing model trained in one environment to another. To address this problem, we introduce a K-Nearest Neighbors Maximum Mean Discrepancy (KNN-MMD) model, a few-shot method for cross-domain Wi-Fi sensing. We propose a local distribution alignment method within each category, which outperforms traditional Domain Adaptation (DA) methods based on global alignment. Besides, our method can determine when to stop training, which cannot be realized by most DA methods. As a result, our method is more stable and can be better used in practice. The effectiveness of our method are evaluated in several cross-domain Wi-Fi sensing tasks, including gesture recognition, person identification, fall detection, and action recognition, using both a public dataset and a self-collected dataset. In one-shot scenario, our method achieves accuracy of 93.26%, 81.84%, 77.62%, and 75.30% in the four tasks respectively. To facilitate future research, we will make our code and dataset publicly available upon publication. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.04783v1-abstract-full').style.display = 'none'; document.getElementById('2412.04783v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11863">arXiv:2411.11863</a> <span> [<a href="https://arxiv.org/pdf/2411.11863">pdf</a>, <a href="https://arxiv.org/ps/2411.11863">ps</a>, <a href="https://arxiv.org/format/2411.11863">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Longitudinal Wrist PPG Analysis for Reliable Hypertension Risk Screening Using Deep Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Lin%2C+H">Hui Lin</a>, <a href="/search/eess?searchtype=author&query=Li%2C+J">Jiyang Li</a>, <a href="/search/eess?searchtype=author&query=Hussein%2C+R">Ramy Hussein</a>, <a href="/search/eess?searchtype=author&query=Sui%2C+X">Xin Sui</a>, <a href="/search/eess?searchtype=author&query=Li%2C+X">Xiaoyu Li</a>, <a href="/search/eess?searchtype=author&query=Zhu%2C+G">Guangpu Zhu</a>, <a href="/search/eess?searchtype=author&query=Katsaggelos%2C+A+K">Aggelos K. Katsaggelos</a>, <a href="/search/eess?searchtype=author&query=Zeng%2C+Z">Zijing Zeng</a>, <a href="/search/eess?searchtype=author&query=Li%2C+Y">Yelei Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11863v1-abstract-short" style="display: inline;"> Hypertension is a leading risk factor for cardiovascular diseases. Traditional blood pressure monitoring methods are cumbersome and inadequate for continuous tracking, prompting the development of PPG-based cuffless blood pressure monitoring wearables. This study leverages deep learning models, including ResNet and Transformer, to analyze wrist PPG data collected with a smartwatch for efficient hy… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11863v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11863v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11863v1-abstract-full" style="display: none;"> Hypertension is a leading risk factor for cardiovascular diseases. Traditional blood pressure monitoring methods are cumbersome and inadequate for continuous tracking, prompting the development of PPG-based cuffless blood pressure monitoring wearables. This study leverages deep learning models, including ResNet and Transformer, to analyze wrist PPG data collected with a smartwatch for efficient hypertension risk screening, eliminating the need for handcrafted PPG features. Using the Home Blood Pressure Monitoring (HBPM) longitudinal dataset of 448 subjects and five-fold cross-validation, our model was trained on over 68k spot-check instances from 358 subjects and tested on real-world continuous recordings of 90 subjects. The compact ResNet model with 0.124M parameters performed significantly better than traditional machine learning methods, demonstrating its effectiveness in distinguishing between healthy and abnormal cases in real-world scenarios. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11863v1-abstract-full').style.display = 'none'; document.getElementById('2411.11863v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">blood pressure, hypertension, cuffless, photoplethysmography, deep learning</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.05167">arXiv:2410.05167</a> <span> [<a href="https://arxiv.org/pdf/2410.05167">pdf</a>, <a href="https://arxiv.org/format/2410.05167">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Presto! Distilling Steps and Layers for Accelerating Music Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Novack%2C+Z">Zachary Novack</a>, <a href="/search/eess?searchtype=author&query=Zhu%2C+G">Ge Zhu</a>, <a href="/search/eess?searchtype=author&query=Casebeer%2C+J">Jonah Casebeer</a>, <a href="/search/eess?searchtype=author&query=McAuley%2C+J">Julian McAuley</a>, <a href="/search/eess?searchtype=author&query=Berg-Kirkpatrick%2C+T">Taylor Berg-Kirkpatrick</a>, <a href="/search/eess?searchtype=author&query=Bryan%2C+N+J">Nicholas J. Bryan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.05167v1-abstract-short" style="display: inline;"> Despite advances in diffusion-based text-to-music (TTM) methods, efficient, high-quality generation remains a challenge. We introduce Presto!, an approach to inference acceleration for score-based diffusion transformers via reducing both sampling steps and cost per step. To reduce steps, we develop a new score-based distribution matching distillation (DMD) method for the EDM-family of diffusion mo… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.05167v1-abstract-full').style.display = 'inline'; document.getElementById('2410.05167v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.05167v1-abstract-full" style="display: none;"> Despite advances in diffusion-based text-to-music (TTM) methods, efficient, high-quality generation remains a challenge. We introduce Presto!, an approach to inference acceleration for score-based diffusion transformers via reducing both sampling steps and cost per step. To reduce steps, we develop a new score-based distribution matching distillation (DMD) method for the EDM-family of diffusion models, the first GAN-based distillation method for TTM. To reduce the cost per step, we develop a simple, but powerful improvement to a recent layer distillation method that improves learning via better preserving hidden state variance. Finally, we combine our step and layer distillation methods together for a dual-faceted approach. We evaluate our step and layer distillation methods independently and show each yield best-in-class performance. Our combined distillation method can generate high-quality outputs with improved diversity, accelerating our base model by 10-18x (230/435ms latency for 32 second mono/stereo 44.1kHz, 15x faster than comparable SOTA) -- the fastest high-quality TTM to our knowledge. Sound examples can be found at https://presto-music.github.io/web/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.05167v1-abstract-full').style.display = 'none'; document.getElementById('2410.05167v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.01330">arXiv:2410.01330</a> <span> [<a href="https://arxiv.org/pdf/2410.01330">pdf</a>, <a href="https://arxiv.org/ps/2410.01330">ps</a>, <a href="https://arxiv.org/format/2410.01330">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Enhancing User Fairness in Wireless Powered Communication Networks with STAR-RIS </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhu%2C+G">Guangyu Zhu</a>, <a href="/search/eess?searchtype=author&query=Mu%2C+X">Xidong Mu</a>, <a href="/search/eess?searchtype=author&query=Guo%2C+L">Li Guo</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+A">Ao Huang</a>, <a href="/search/eess?searchtype=author&query=Xu%2C+S">Shibiao Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.01330v1-abstract-short" style="display: inline;"> A simultaneously transmitting and reflecting reconfigurable intelligent surface (STAR-RIS) assisted wireless powered communication network (WPCN) is proposed, where two energy-limited devices first harvest energy from a hybrid access point (HAP) and then use that energy to transmit information back. To fully eliminate the doubly-near-far effect in WPCNs, two STAR-RIS operating protocol-driven tran… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.01330v1-abstract-full').style.display = 'inline'; document.getElementById('2410.01330v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.01330v1-abstract-full" style="display: none;"> A simultaneously transmitting and reflecting reconfigurable intelligent surface (STAR-RIS) assisted wireless powered communication network (WPCN) is proposed, where two energy-limited devices first harvest energy from a hybrid access point (HAP) and then use that energy to transmit information back. To fully eliminate the doubly-near-far effect in WPCNs, two STAR-RIS operating protocol-driven transmission strategies, namely energy splitting non-orthogonal multiple access (ES- NOMA) and time switching time division multiple access (TS- TDMA) are proposed. For each strategy, the corresponding optimization problem is formulated to maximize the minimum throughput by jointly optimizing time allocation, user transmit power, active HAP beamforming, and passive STAR-RIS beamforming. For ES-NOMA, the resulting intractable problem is solved via a two-layer algorithm, which exploits the one-dimensional search and block coordinate descent methods in an iterative manner. For TS-TDMA, the optimal active beamforming and passive beamforming are first determined according to the maximum-ratio transmission beamformer. Then, the optimal solution of the time allocation variables is obtained by solving a standard convex problem. Numerical results show that: 1) the STAR-RIS can achieve considerable performance improvements for both strategies compared to the conventional RIS; 2) TS- TDMA is preferred for single-antenna scenarios, whereas ES- NOMA is better suited for multi-antenna scenarios; and 3) the superiority of ES-NOMA over TS-TDMA is enhanced as the number of STAR-RIS elements increases. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.01330v1-abstract-full').style.display = 'none'; document.getElementById('2410.01330v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.11849">arXiv:2408.11849</a> <span> [<a href="https://arxiv.org/pdf/2408.11849">pdf</a>, <a href="https://arxiv.org/format/2408.11849">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Style-Talker: Finetuning Audio Language Model and Style-Based Text-to-Speech Model for Fast Spoken Dialogue Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Li%2C+Y+A">Yinghao Aaron Li</a>, <a href="/search/eess?searchtype=author&query=Jiang%2C+X">Xilin Jiang</a>, <a href="/search/eess?searchtype=author&query=Darefsky%2C+J">Jordan Darefsky</a>, <a href="/search/eess?searchtype=author&query=Zhu%2C+G">Ge Zhu</a>, <a href="/search/eess?searchtype=author&query=Mesgarani%2C+N">Nima Mesgarani</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.11849v1-abstract-short" style="display: inline;"> The rapid advancement of large language models (LLMs) has significantly propelled the development of text-based chatbots, demonstrating their capability to engage in coherent and contextually relevant dialogues. However, extending these advancements to enable end-to-end speech-to-speech conversation bots remains a formidable challenge, primarily due to the extensive dataset and computational resou… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.11849v1-abstract-full').style.display = 'inline'; document.getElementById('2408.11849v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.11849v1-abstract-full" style="display: none;"> The rapid advancement of large language models (LLMs) has significantly propelled the development of text-based chatbots, demonstrating their capability to engage in coherent and contextually relevant dialogues. However, extending these advancements to enable end-to-end speech-to-speech conversation bots remains a formidable challenge, primarily due to the extensive dataset and computational resources required. The conventional approach of cascading automatic speech recognition (ASR), LLM, and text-to-speech (TTS) models in a pipeline, while effective, suffers from unnatural prosody because it lacks direct interactions between the input audio and its transcribed text and the output audio. These systems are also limited by their inherent latency from the ASR process for real-time applications. This paper introduces Style-Talker, an innovative framework that fine-tunes an audio LLM alongside a style-based TTS model for fast spoken dialog generation. Style-Talker takes user input audio and uses transcribed chat history and speech styles to generate both the speaking style and text for the response. Subsequently, the TTS model synthesizes the speech, which is then played back to the user. While the response speech is being played, the input speech undergoes ASR processing to extract the transcription and speaking style, serving as the context for the ensuing dialogue turn. This novel pipeline accelerates the traditional cascade ASR-LLM-TTS systems while integrating rich paralinguistic information from input speech. Our experimental results show that Style-Talker significantly outperforms the conventional cascade and speech-to-speech baselines in terms of both dialogue naturalness and coherence while being more than 50% faster. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.11849v1-abstract-full').style.display = 'none'; document.getElementById('2408.11849v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">CoLM 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.10919">arXiv:2408.10919</a> <span> [<a href="https://arxiv.org/pdf/2408.10919">pdf</a>, <a href="https://arxiv.org/format/2408.10919">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> CrossFi: A Cross Domain Wi-Fi Sensing Framework Based on Siamese Network </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhao%2C+Z">Zijian Zhao</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+T">Tingwei Chen</a>, <a href="/search/eess?searchtype=author&query=Cai%2C+Z">Zhijie Cai</a>, <a href="/search/eess?searchtype=author&query=Li%2C+X">Xiaoyang Li</a>, <a href="/search/eess?searchtype=author&query=Li%2C+H">Hang Li</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+Q">Qimei Chen</a>, <a href="/search/eess?searchtype=author&query=Zhu%2C+G">Guangxu Zhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.10919v2-abstract-short" style="display: inline;"> In recent years, Wi-Fi sensing has garnered significant attention due to its numerous benefits, such as privacy protection, low cost, and penetration ability. Extensive research has been conducted in this field, focusing on areas such as gesture recognition, people identification, and fall detection. However, many data-driven methods encounter challenges related to domain shift, where the model fa… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.10919v2-abstract-full').style.display = 'inline'; document.getElementById('2408.10919v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.10919v2-abstract-full" style="display: none;"> In recent years, Wi-Fi sensing has garnered significant attention due to its numerous benefits, such as privacy protection, low cost, and penetration ability. Extensive research has been conducted in this field, focusing on areas such as gesture recognition, people identification, and fall detection. However, many data-driven methods encounter challenges related to domain shift, where the model fails to perform well in environments different from the training data. One major factor contributing to this issue is the limited availability of Wi-Fi sensing datasets, which makes models learn excessive irrelevant information and over-fit to the training set. Unfortunately, collecting large-scale Wi-Fi sensing datasets across diverse scenarios is a challenging task. To address this problem, we propose CrossFi, a siamese network-based approach that excels in both in-domain scenario and cross-domain scenario, including few-shot, zero-shot scenarios, and even works in few-shot new-class scenario where testing set contains new categories. The core component of CrossFi is a sample-similarity calculation network called CSi-Net, which improves the structure of the siamese network by using an attention mechanism to capture similarity information, instead of simply calculating the distance or cosine similarity. Based on it, we develop an extra Weight-Net that can generate a template for each class, so that our CrossFi can work in different scenarios. Experimental results demonstrate that our CrossFi achieves state-of-the-art performance across various scenarios. In gesture recognition task, our CrossFi achieves an accuracy of 98.17% in in-domain scenario, 91.72% in one-shot cross-domain scenario, 64.81% in zero-shot cross-domain scenario, and 84.75% in one-shot new-class scenario. To facilitate future research, we will release the code for our model upon publication. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.10919v2-abstract-full').style.display = 'none'; document.getElementById('2408.10919v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 20 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.10378">arXiv:2408.10378</a> <span> [<a href="https://arxiv.org/pdf/2408.10378">pdf</a>, <a href="https://arxiv.org/format/2408.10378">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Optimization and Control">math.OC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Finite-time input-to-state stability for infinite-dimensional systems </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Sun%2C+X">Xiaorong Sun</a>, <a href="/search/eess?searchtype=author&query=Zheng%2C+J">Jun Zheng</a>, <a href="/search/eess?searchtype=author&query=Zhu%2C+G">Guchuan Zhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.10378v1-abstract-short" style="display: inline;"> In this paper, we extend the notion of finite-time input-to-state stability (FTISS) for finite-dimensional systems to infinite-dimensional systems. More specifically, we first prove an FTISS Lyapunov theorem for a class of infinite-dimensional systems, namely, the existence of an FTISS Lyapunov functional (FTISS-LF) implies the FTISS of the system, and then, provide a sufficient condition for ensu… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.10378v1-abstract-full').style.display = 'inline'; document.getElementById('2408.10378v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.10378v1-abstract-full" style="display: none;"> In this paper, we extend the notion of finite-time input-to-state stability (FTISS) for finite-dimensional systems to infinite-dimensional systems. More specifically, we first prove an FTISS Lyapunov theorem for a class of infinite-dimensional systems, namely, the existence of an FTISS Lyapunov functional (FTISS-LF) implies the FTISS of the system, and then, provide a sufficient condition for ensuring the existence of an FTISS-LF for a class of abstract infinite-dimensional systems under the framework of compact semigroup theory and Hilbert spaces. As an application of the FTISS Lyapunov theorem, we verify the FTISS for a class of parabolic PDEs involving sublinear terms and distributed in-domain disturbances. Since the nonlinear terms of the corresponding abstract system are not Lipschitz continuous, the well-posedness is proved based on the application of compact semigroup theory and the FTISS is assessed by using the Lyapunov method with the aid of an interpolation inequality. Numerical simulations are conducted to confirm the theoretical results. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.10378v1-abstract-full').style.display = 'none'; document.getElementById('2408.10378v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.04928">arXiv:2407.04928</a> <span> [<a href="https://arxiv.org/pdf/2407.04928">pdf</a>, <a href="https://arxiv.org/format/2407.04928">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> CLIPVQA:Video Quality Assessment via CLIP </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Xing%2C+F">Fengchuang Xing</a>, <a href="/search/eess?searchtype=author&query=Li%2C+M">Mingjie Li</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+Y">Yuan-Gen Wang</a>, <a href="/search/eess?searchtype=author&query=Zhu%2C+G">Guopu Zhu</a>, <a href="/search/eess?searchtype=author&query=Cao%2C+X">Xiaochun Cao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.04928v1-abstract-short" style="display: inline;"> In learning vision-language representations from web-scale data, the contrastive language-image pre-training (CLIP) mechanism has demonstrated a remarkable performance in many vision tasks. However, its application to the widely studied video quality assessment (VQA) task is still an open issue. In this paper, we propose an efficient and effective CLIP-based Transformer method for the VQA problem… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.04928v1-abstract-full').style.display = 'inline'; document.getElementById('2407.04928v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.04928v1-abstract-full" style="display: none;"> In learning vision-language representations from web-scale data, the contrastive language-image pre-training (CLIP) mechanism has demonstrated a remarkable performance in many vision tasks. However, its application to the widely studied video quality assessment (VQA) task is still an open issue. In this paper, we propose an efficient and effective CLIP-based Transformer method for the VQA problem (CLIPVQA). Specifically, we first design an effective video frame perception paradigm with the goal of extracting the rich spatiotemporal quality and content information among video frames. Then, the spatiotemporal quality features are adequately integrated together using a self-attention mechanism to yield video-level quality representation. To utilize the quality language descriptions of videos for supervision, we develop a CLIP-based encoder for language embedding, which is then fully aggregated with the generated content information via a cross-attention module for producing video-language representation. Finally, the video-level quality and video-language representations are fused together for final video quality prediction, where a vectorized regression loss is employed for efficient end-to-end optimization. Comprehensive experiments are conducted on eight in-the-wild video datasets with diverse resolutions to evaluate the performance of CLIPVQA. The experimental results show that the proposed CLIPVQA achieves new state-of-the-art VQA performance and up to 37% better generalizability than existing benchmark VQA methods. A series of ablation studies are also performed to validate the effectiveness of each module in CLIPVQA. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.04928v1-abstract-full').style.display = 'none'; document.getElementById('2407.04928v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.00955">arXiv:2407.00955</a> <span> [<a href="https://arxiv.org/pdf/2407.00955">pdf</a>, <a href="https://arxiv.org/format/2407.00955">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Task-oriented Over-the-air Computation for Edge-device Co-inference with Balanced Classification Accuracy </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Jiao%2C+X">Xiang Jiao</a>, <a href="/search/eess?searchtype=author&query=Wen%2C+D">Dingzhu Wen</a>, <a href="/search/eess?searchtype=author&query=Zhu%2C+G">Guangxu Zhu</a>, <a href="/search/eess?searchtype=author&query=Jiang%2C+W">Wei Jiang</a>, <a href="/search/eess?searchtype=author&query=Luo%2C+W">Wu Luo</a>, <a href="/search/eess?searchtype=author&query=Shi%2C+Y">Yuanming Shi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.00955v1-abstract-short" style="display: inline;"> Edge-device co-inference, which concerns the cooperation between edge devices and an edge server for completing inference tasks over wireless networks, has been a promising technique for enabling various kinds of intelligent services at the network edge, e.g., auto-driving. In this paradigm, the concerned design objective of the network shifts from the traditional communication throughput to the e… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.00955v1-abstract-full').style.display = 'inline'; document.getElementById('2407.00955v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.00955v1-abstract-full" style="display: none;"> Edge-device co-inference, which concerns the cooperation between edge devices and an edge server for completing inference tasks over wireless networks, has been a promising technique for enabling various kinds of intelligent services at the network edge, e.g., auto-driving. In this paradigm, the concerned design objective of the network shifts from the traditional communication throughput to the effective and efficient execution of the inference task underpinned by the network, measured by, e.g., the inference accuracy and latency. In this paper, a task-oriented over-the-air computation scheme is proposed for a multidevice artificial intelligence system. Particularly, a novel tractable inference accuracy metric is proposed for classification tasks, which is called minimum pair-wise discriminant gain. Unlike prior work measuring the average of all class pairs in feature space, it measures the minimum distance of all class pairs. By maximizing the minimum pair-wise discriminant gain instead of its average counterpart, any pair of classes can be better separated in the feature space, and thus leading to a balanced and improved inference accuracy for all classes. Besides, this paper jointly optimizes the minimum discriminant gain of all feature elements instead of separately maximizing that of each element in the existing designs. As a result, the transmit power can be adaptively allocated to the feature elements according to their different contributions to the inference accuracy, opening an extra degree of freedom to improve inference performance. Extensive experiments are conducted using a concrete use case of human motion recognition to verify the superiority of the proposed design over the benchmarking scheme. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.00955v1-abstract-full').style.display = 'none'; document.getElementById('2407.00955v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This paper was accepted by IEEE Transactions on Vehicular Technology on June 30, 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.00297">arXiv:2407.00297</a> <span> [<a href="https://arxiv.org/pdf/2407.00297">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> UADSN: Uncertainty-Aware Dual-Stream Network for Facial Nerve Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhu%2C+G">Guanghao Zhu</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+L">Lin Liu</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+J">Jing Zhang</a>, <a href="/search/eess?searchtype=author&query=Du%2C+X">Xiaohui Du</a>, <a href="/search/eess?searchtype=author&query=Hao%2C+R">Ruqian Hao</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+J">Juanxiu Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.00297v1-abstract-short" style="display: inline;"> Facial nerve segmentation is crucial for preoperative path planning in cochlear implantation surgery. Recently, researchers have proposed some segmentation methods, such as atlas-based and deep learning-based methods. However, since the facial nerve is a tubular organ with a diameter of only 1.0-1.5mm, it is challenging to locate and segment the facial nerve in CT scans. In this work, we propose a… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.00297v1-abstract-full').style.display = 'inline'; document.getElementById('2407.00297v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.00297v1-abstract-full" style="display: none;"> Facial nerve segmentation is crucial for preoperative path planning in cochlear implantation surgery. Recently, researchers have proposed some segmentation methods, such as atlas-based and deep learning-based methods. However, since the facial nerve is a tubular organ with a diameter of only 1.0-1.5mm, it is challenging to locate and segment the facial nerve in CT scans. In this work, we propose an uncertainty-aware dualstream network (UADSN). UADSN consists of a 2D segmentation stream and a 3D segmentation stream. Predictions from two streams are used to identify uncertain regions, and a consistency loss is employed to supervise the segmentation of these regions. In addition, we introduce channel squeeze & spatial excitation modules into the skip connections of U-shaped networks to extract meaningful spatial information. In order to consider topologypreservation, a clDice loss is introduced into the supervised loss function. Experimental results on the facial nerve dataset demonstrate the effectiveness of UADSN and our submodules. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.00297v1-abstract-full').style.display = 'none'; document.getElementById('2407.00297v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.19649">arXiv:2406.19649</a> <span> [<a href="https://arxiv.org/pdf/2406.19649">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> AstMatch: Adversarial Self-training Consistency Framework for Semi-Supervised Medical Image Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhu%2C+G">Guanghao Zhu</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+J">Jing Zhang</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+J">Juanxiu Liu</a>, <a href="/search/eess?searchtype=author&query=Du%2C+X">Xiaohui Du</a>, <a href="/search/eess?searchtype=author&query=Hao%2C+R">Ruqian Hao</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+Y">Yong Liu</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+L">Lin Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.19649v1-abstract-short" style="display: inline;"> Semi-supervised learning (SSL) has shown considerable potential in medical image segmentation, primarily leveraging consistency regularization and pseudo-labeling. However, many SSL approaches only pay attention to low-level consistency and overlook the significance of pseudo-label reliability. Therefore, in this work, we propose an adversarial self-training consistency framework (AstMatch). First… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.19649v1-abstract-full').style.display = 'inline'; document.getElementById('2406.19649v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.19649v1-abstract-full" style="display: none;"> Semi-supervised learning (SSL) has shown considerable potential in medical image segmentation, primarily leveraging consistency regularization and pseudo-labeling. However, many SSL approaches only pay attention to low-level consistency and overlook the significance of pseudo-label reliability. Therefore, in this work, we propose an adversarial self-training consistency framework (AstMatch). Firstly, we design an adversarial consistency regularization (ACR) approach to enhance knowledge transfer and strengthen prediction consistency under varying perturbation intensities. Second, we apply a feature matching loss for adversarial training to incorporate high-level consistency regularization. Additionally, we present the pyramid channel attention (PCA) and efficient channel and spatial attention (ECSA) modules to improve the discriminator's performance. Finally, we propose an adaptive self-training (AST) approach to ensure the pseudo-labels' quality. The proposed AstMatch has been extensively evaluated with cutting-edge SSL methods on three public-available datasets. The experimental results under different labeled ratios indicate that AstMatch outperforms other existing methods, achieving new state-of-the-art performance. Our code will be available at https://github.com/GuanghaoZhu663/AstMatch. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.19649v1-abstract-full').style.display = 'none'; document.getElementById('2406.19649v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.16967">arXiv:2406.16967</a> <span> [<a href="https://arxiv.org/pdf/2406.16967">pdf</a>, <a href="https://arxiv.org/format/2406.16967">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Remaining useful life prediction of rolling bearings based on refined composite multi-scale attention entropy and dispersion entropy </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Long%2C+Y">Yunchong Long</a>, <a href="/search/eess?searchtype=author&query=Pang%2C+Q">Qinkang Pang</a>, <a href="/search/eess?searchtype=author&query=Zhu%2C+G">Guangjie Zhu</a>, <a href="/search/eess?searchtype=author&query=Cheng%2C+J">Junxian Cheng</a>, <a href="/search/eess?searchtype=author&query=Li%2C+X">Xiangshun Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.16967v1-abstract-short" style="display: inline;"> Remaining useful life (RUL) prediction based on vibration signals is crucial for ensuring the safe operation and effective health management of rotating machinery. Existing studies often extract health indicators (HI) from time domain and frequency domain features to analyze complex vibration signals, but these features may not accurately capture the degradation process. In this study, we propose… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.16967v1-abstract-full').style.display = 'inline'; document.getElementById('2406.16967v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.16967v1-abstract-full" style="display: none;"> Remaining useful life (RUL) prediction based on vibration signals is crucial for ensuring the safe operation and effective health management of rotating machinery. Existing studies often extract health indicators (HI) from time domain and frequency domain features to analyze complex vibration signals, but these features may not accurately capture the degradation process. In this study, we propose a degradation feature extraction method called Fusion of Multi-Modal Multi-Scale Entropy (FMME), which utilizes multi-modal Refined Composite Multi-scale Attention Entropy (RCMATE) and Fluctuation Dispersion Entropy (RCMFDE), to solve the problem that the existing degradation features cannot accurately reflect the degradation process. Firstly, the Empirical Mode Decomposition (EMD) is employed to decompose the dual-channel vibration signals of bearings into multiple modals. The main modals are then selected for further analysis. The subsequent step involves the extraction of RCMATE and RCMFDE from each modal, followed by wavelet denoising. Next, a novel metric is proposed to evaluate the quality of degradation features. The attention entropy and dispersion entropy of the optimal scales under different modals are fused using Laplacian Eigenmap (LE) to obtain the health indicators. Finally, RUL prediction is performed through the similarity of health indicators between fault samples and bearings to be predicted. Experimental results demonstrate that the proposed method yields favorable outcomes across diverse operating conditions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.16967v1-abstract-full').style.display = 'none'; document.getElementById('2406.16967v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">12pages, 9 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.16929">arXiv:2406.16929</a> <span> [<a href="https://arxiv.org/pdf/2406.16929">pdf</a>, <a href="https://arxiv.org/format/2406.16929">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Modelling the 5G Energy Consumption using Real-world Data: Energy Fingerprint is All You Need </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Chen%2C+T">Tingwei Chen</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+Y">Yantao Wang</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+H">Hanzhi Chen</a>, <a href="/search/eess?searchtype=author&query=Zhao%2C+Z">Zijian Zhao</a>, <a href="/search/eess?searchtype=author&query=Li%2C+X">Xinhao Li</a>, <a href="/search/eess?searchtype=author&query=Piovesan%2C+N">Nicola Piovesan</a>, <a href="/search/eess?searchtype=author&query=Zhu%2C+G">Guangxu Zhu</a>, <a href="/search/eess?searchtype=author&query=Shi%2C+Q">Qingjiang Shi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.16929v1-abstract-short" style="display: inline;"> The introduction of fifth-generation (5G) radio technology has revolutionized communications, bringing unprecedented automation, capacity, connectivity, and ultra-fast, reliable communications. However, this technological leap comes with a substantial increase in energy consumption, presenting a significant challenge. To improve the energy efficiency of 5G networks, it is imperative to develop sop… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.16929v1-abstract-full').style.display = 'inline'; document.getElementById('2406.16929v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.16929v1-abstract-full" style="display: none;"> The introduction of fifth-generation (5G) radio technology has revolutionized communications, bringing unprecedented automation, capacity, connectivity, and ultra-fast, reliable communications. However, this technological leap comes with a substantial increase in energy consumption, presenting a significant challenge. To improve the energy efficiency of 5G networks, it is imperative to develop sophisticated models that accurately reflect the influence of base station (BS) attributes and operational conditions on energy usage.Importantly, addressing the complexity and interdependencies of these diverse features is particularly challenging, both in terms of data processing and model architecture design. This paper proposes a novel 5G base stations energy consumption modelling method by learning from a real-world dataset used in the ITU 5G Base Station Energy Consumption Modelling Challenge in which our model ranked second. Unlike existing methods that omit the Base Station Identifier (BSID) information and thus fail to capture the unique energy fingerprint in different base stations, we incorporate the BSID into the input features and encoding it with an embedding layer for precise representation. Additionally, we introduce a novel masked training method alongside an attention mechanism to further boost the model's generalization capabilities and accuracy. After evaluation, our method demonstrates significant improvements over existing models, reducing Mean Absolute Percentage Error (MAPE) from 12.75% to 4.98%, leading to a performance gain of more than 60%. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.16929v1-abstract-full').style.display = 'none'; document.getElementById('2406.16929v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.19182">arXiv:2404.19182</a> <span> [<a href="https://arxiv.org/pdf/2404.19182">pdf</a>, <a href="https://arxiv.org/format/2404.19182">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Robust Proximity Detection using On-Device Gait Monitoring </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Hu%2C+Y">Yuqian Hu</a>, <a href="/search/eess?searchtype=author&query=Zhu%2C+G">Guozhen Zhu</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+B">Beibei Wang</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+K+J+R">K. J. Ray Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.19182v1-abstract-short" style="display: inline;"> Proximity detection in indoor environments based on WiFi signals has gained significant attention in recent years. Existing works rely on the dynamic signal reflections and their extracted features are dependent on motion strength. To address this issue, we design a robust WiFi-based proximity detector by considering gait monitoring. Specifically, we propose a gait score that accurately evaluates… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.19182v1-abstract-full').style.display = 'inline'; document.getElementById('2404.19182v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.19182v1-abstract-full" style="display: none;"> Proximity detection in indoor environments based on WiFi signals has gained significant attention in recent years. Existing works rely on the dynamic signal reflections and their extracted features are dependent on motion strength. To address this issue, we design a robust WiFi-based proximity detector by considering gait monitoring. Specifically, we propose a gait score that accurately evaluates gait presence by leveraging the speed estimated from the autocorrelation function (ACF) of channel state information (CSI). By combining this gait score with a proximity feature, our approach effectively distinguishes different transition patterns, enabling more reliable proximity detection. In addition, to enhance the stability of the detection process, we employ a state machine and extract temporal information, ensuring continuous proximity detection even during subtle movements. Extensive experiments conducted in different environments demonstrate an overall detection rate of 92.5% and a low false alarm rate of 1.12% with a delay of 0.825s. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.19182v1-abstract-full').style.display = 'none'; document.getElementById('2404.19182v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This work has been accepted in IEEE 9th World Forum on Internet of Things (WFIoT)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.06007">arXiv:2404.06007</a> <span> [<a href="https://arxiv.org/pdf/2404.06007">pdf</a>, <a href="https://arxiv.org/format/2404.06007">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Collaborative Edge AI Inference over Cloud-RAN </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhang%2C+P">Pengfei Zhang</a>, <a href="/search/eess?searchtype=author&query=Wen%2C+D">Dingzhu Wen</a>, <a href="/search/eess?searchtype=author&query=Zhu%2C+G">Guangxu Zhu</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+Q">Qimei Chen</a>, <a href="/search/eess?searchtype=author&query=Han%2C+K">Kaifeng Han</a>, <a href="/search/eess?searchtype=author&query=Shi%2C+Y">Yuanming Shi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.06007v1-abstract-short" style="display: inline;"> In this paper, a cloud radio access network (Cloud-RAN) based collaborative edge AI inference architecture is proposed. Specifically, geographically distributed devices capture real-time noise-corrupted sensory data samples and extract the noisy local feature vectors, which are then aggregated at each remote radio head (RRH) to suppress sensing noise. To realize efficient uplink feature aggregatio… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.06007v1-abstract-full').style.display = 'inline'; document.getElementById('2404.06007v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.06007v1-abstract-full" style="display: none;"> In this paper, a cloud radio access network (Cloud-RAN) based collaborative edge AI inference architecture is proposed. Specifically, geographically distributed devices capture real-time noise-corrupted sensory data samples and extract the noisy local feature vectors, which are then aggregated at each remote radio head (RRH) to suppress sensing noise. To realize efficient uplink feature aggregation, we allow each RRH receives local feature vectors from all devices over the same resource blocks simultaneously by leveraging an over-the-air computation (AirComp) technique. Thereafter, these aggregated feature vectors are quantized and transmitted to a central processor (CP) for further aggregation and downstream inference tasks. Our aim in this work is to maximize the inference accuracy via a surrogate accuracy metric called discriminant gain, which measures the discernibility of different classes in the feature space. The key challenges lie on simultaneously suppressing the coupled sensing noise, AirComp distortion caused by hostile wireless channels, and the quantization error resulting from the limited capacity of fronthaul links. To address these challenges, this work proposes a joint transmit precoding, receive beamforming, and quantization error control scheme to enhance the inference accuracy. Extensive numerical experiments demonstrate the effectiveness and superiority of our proposed optimization algorithm compared to various baselines. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.06007v1-abstract-full').style.display = 'none'; document.getElementById('2404.06007v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This paper is accepted by IEEE Transactions on Communications on 08-Apr-2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.16397">arXiv:2403.16397</a> <span> [<a href="https://arxiv.org/pdf/2403.16397">pdf</a>, <a href="https://arxiv.org/format/2403.16397">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/TWC.2024.3457157">10.1109/TWC.2024.3457157 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> RadioGAT: A Joint Model-based and Data-driven Framework for Multi-band Radiomap Reconstruction via Graph Attention Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Li%2C+X">Xiaojie Li</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+S">Songyang Zhang</a>, <a href="/search/eess?searchtype=author&query=Li%2C+H">Hang Li</a>, <a href="/search/eess?searchtype=author&query=Li%2C+X">Xiaoyang Li</a>, <a href="/search/eess?searchtype=author&query=Xu%2C+L">Lexi Xu</a>, <a href="/search/eess?searchtype=author&query=Xu%2C+H">Haigao Xu</a>, <a href="/search/eess?searchtype=author&query=Mei%2C+H">Hui Mei</a>, <a href="/search/eess?searchtype=author&query=Zhu%2C+G">Guangxu Zhu</a>, <a href="/search/eess?searchtype=author&query=Qi%2C+N">Nan Qi</a>, <a href="/search/eess?searchtype=author&query=Xiao%2C+M">Ming Xiao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.16397v2-abstract-short" style="display: inline;"> Multi-band radiomap reconstruction (MB-RMR) is a key component in wireless communications for tasks such as spectrum management and network planning. However, traditional machine-learning-based MB-RMR methods, which rely heavily on simulated data or complete structured ground truth, face significant deployment challenges. These challenges stem from the differences between simulated and actual data… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.16397v2-abstract-full').style.display = 'inline'; document.getElementById('2403.16397v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.16397v2-abstract-full" style="display: none;"> Multi-band radiomap reconstruction (MB-RMR) is a key component in wireless communications for tasks such as spectrum management and network planning. However, traditional machine-learning-based MB-RMR methods, which rely heavily on simulated data or complete structured ground truth, face significant deployment challenges. These challenges stem from the differences between simulated and actual data, as well as the scarcity of real-world measurements. To address these challenges, our study presents RadioGAT, a novel framework based on Graph Attention Network (GAT) tailored for MB-RMR within a single area, eliminating the need for multi-region datasets. RadioGAT innovatively merges model-based spatial-spectral correlation encoding with data-driven radiomap generalization, thus minimizing the reliance on extensive data sources. The framework begins by transforming sparse multi-band data into a graph structure through an innovative encoding strategy that leverages radio propagation models to capture the spatial-spectral correlation inherent in the data. This graph-based representation not only simplifies data handling but also enables tailored label sampling during training, significantly enhancing the framework's adaptability for deployment. Subsequently, The GAT is employed to generalize the radiomap information across various frequency bands. Extensive experiments using raytracing datasets based on real-world environments have demonstrated RadioGAT's enhanced accuracy in supervised learning settings and its robustness in semi-supervised scenarios. These results underscore RadioGAT's effectiveness and practicality for MB-RMR in environments with limited data availability. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.16397v2-abstract-full').style.display = 'none'; document.getElementById('2403.16397v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 24 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">IEEE Transactions on Wireless Communications, early access, 2024</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> IEEE Transactions on Wireless Communications, vol. 23, no. 11, pp. 17777-17792, Nov. 2024 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.15145">arXiv:2403.15145</a> <span> [<a href="https://arxiv.org/pdf/2403.15145">pdf</a>, <a href="https://arxiv.org/ps/2403.15145">ps</a>, <a href="https://arxiv.org/format/2403.15145">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Robust Resource Allocation for STAR-RIS Assisted SWIPT Systems </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhu%2C+G">Guangyu Zhu</a>, <a href="/search/eess?searchtype=author&query=Mu%2C+X">Xidong Mu</a>, <a href="/search/eess?searchtype=author&query=Guo%2C+L">Li Guo</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+A">Ao Huang</a>, <a href="/search/eess?searchtype=author&query=Xu%2C+S">Shibiao Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.15145v1-abstract-short" style="display: inline;"> A simultaneously transmitting and reflecting reconfigurable intelligent surface (STAR-RIS) assisted simultaneous wireless information and power transfer (SWIPT) system is proposed. More particularly, an STAR-RIS is deployed to assist in the information/power transfer from a multi-antenna access point (AP) to multiple single-antenna information users (IUs) and energy users (EUs), where two practica… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.15145v1-abstract-full').style.display = 'inline'; document.getElementById('2403.15145v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.15145v1-abstract-full" style="display: none;"> A simultaneously transmitting and reflecting reconfigurable intelligent surface (STAR-RIS) assisted simultaneous wireless information and power transfer (SWIPT) system is proposed. More particularly, an STAR-RIS is deployed to assist in the information/power transfer from a multi-antenna access point (AP) to multiple single-antenna information users (IUs) and energy users (EUs), where two practical STAR-RIS operating protocols, namely energy splitting (ES) and time switching (TS), are employed. Under the imperfect channel state information (CSI) condition, a multi-objective optimization problem (MOOP) framework, that simultaneously maximizes the minimum data rate and minimum harvested power, is employed to investigate the fundamental rate-energy trade-off between IUs and EUs. To obtain the optimal robust resource allocation strategy, the MOOP is first transformed into a single-objective optimization problem (SOOP) via the 蔚-constraint method, which is then reformulated by approximating semi-infinite inequality constraints with the S-procedure. For ES, an alternating optimization (AO)-based algorithm is proposed to jointly design AP active beamforming and STAR-RIS passive beamforming, where a penalty method is leveraged in STAR-RIS beamforming design. Furthermore, the developed algorithm is extended to optimize the time allocation policy and beamforming vectors in a two-layer iterative manner for TS. Numerical results reveal that: 1) deploying STAR-RISs achieves a significant performance gain over conventional RISs, especially in terms of harvested power for EUs; 2) the ES protocol obtains a better user fairness performance when focusing only on IUs or EUs, while the TS protocol yields a better balance between IUs and EUs; 3) the imperfect CSI affects IUs more significantly than EUs, whereas TS can confer a more robust design to attenuate these effects. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.15145v1-abstract-full').style.display = 'none'; document.getElementById('2403.15145v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.12400">arXiv:2403.12400</a> <span> [<a href="https://arxiv.org/pdf/2403.12400">pdf</a>, <a href="https://arxiv.org/format/2403.12400">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/INFOCOMWKSHPS61880.2024.10620769">10.1109/INFOCOMWKSHPS61880.2024.10620769 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Finding the Missing Data: A BERT-inspired Approach Against Package Loss in Wireless Sensing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhao%2C+Z">Zijian Zhao</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+T">Tingwei Chen</a>, <a href="/search/eess?searchtype=author&query=Meng%2C+F">Fanyi Meng</a>, <a href="/search/eess?searchtype=author&query=Li%2C+H">Hang Li</a>, <a href="/search/eess?searchtype=author&query=Li%2C+X">Xiaoyang Li</a>, <a href="/search/eess?searchtype=author&query=Zhu%2C+G">Guangxu Zhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.12400v1-abstract-short" style="display: inline;"> Despite the development of various deep learning methods for Wi-Fi sensing, package loss often results in noncontinuous estimation of the Channel State Information (CSI), which negatively impacts the performance of the learning models. To overcome this challenge, we propose a deep learning model based on Bidirectional Encoder Representations from Transformers (BERT) for CSI recovery, named CSI-BER… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.12400v1-abstract-full').style.display = 'inline'; document.getElementById('2403.12400v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.12400v1-abstract-full" style="display: none;"> Despite the development of various deep learning methods for Wi-Fi sensing, package loss often results in noncontinuous estimation of the Channel State Information (CSI), which negatively impacts the performance of the learning models. To overcome this challenge, we propose a deep learning model based on Bidirectional Encoder Representations from Transformers (BERT) for CSI recovery, named CSI-BERT. CSI-BERT can be trained in an self-supervised manner on the target dataset without the need for additional data. Furthermore, unlike traditional interpolation methods that focus on one subcarrier at a time, CSI-BERT captures the sequential relationships across different subcarriers. Experimental results demonstrate that CSI-BERT achieves lower error rates and faster speed compared to traditional interpolation methods, even when facing with high loss rates. Moreover, by harnessing the recovered CSI obtained from CSI-BERT, other deep learning models like Residual Network and Recurrent Neural Network can achieve an average increase in accuracy of approximately 15\% in Wi-Fi sensing tasks. The collected dataset WiGesture and code for our model are publicly available at https://github.com/RS2002/CSI-BERT. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.12400v1-abstract-full').style.display = 'none'; document.getElementById('2403.12400v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">6 pages, accepted by IEEE INFOCOM Deepwireless Workshop 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.11693">arXiv:2403.11693</a> <span> [<a href="https://arxiv.org/pdf/2403.11693">pdf</a>, <a href="https://arxiv.org/format/2403.11693">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Beamforming Design for Semantic-Bit Coexisting Communication System </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhang%2C+M">Maojun Zhang</a>, <a href="/search/eess?searchtype=author&query=Zhu%2C+G">Guangxu Zhu</a>, <a href="/search/eess?searchtype=author&query=Jin%2C+R">Richeng Jin</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+X">Xiaoming Chen</a>, <a href="/search/eess?searchtype=author&query=Shi%2C+Q">Qingjiang Shi</a>, <a href="/search/eess?searchtype=author&query=Zhong%2C+C">Caijun Zhong</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+K">Kaibin Huang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.11693v3-abstract-short" style="display: inline;"> Semantic communication (SemCom) is emerging as a key technology for future sixth-generation (6G) systems. Unlike traditional bit-level communication (BitCom), SemCom directly optimizes performance at the semantic level, leading to superior communication efficiency. Nevertheless, the task-oriented nature of SemCom renders it challenging to completely replace BitCom. Consequently, it is desired to c… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.11693v3-abstract-full').style.display = 'inline'; document.getElementById('2403.11693v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.11693v3-abstract-full" style="display: none;"> Semantic communication (SemCom) is emerging as a key technology for future sixth-generation (6G) systems. Unlike traditional bit-level communication (BitCom), SemCom directly optimizes performance at the semantic level, leading to superior communication efficiency. Nevertheless, the task-oriented nature of SemCom renders it challenging to completely replace BitCom. Consequently, it is desired to consider a semantic-bit coexisting communication system, where a base station (BS) serves SemCom users (sem-users) and BitCom users (bit-users) simultaneously. Such a system faces severe and heterogeneous inter-user interference. In this context, this paper provides a new semantic-bit coexisting communication framework and proposes a spatial beamforming scheme to accommodate both types of users. Specifically, we consider maximizing the semantic rate for semantic users while ensuring the quality-of-service (QoS) requirements for bit-users. Due to the intractability of obtaining the exact closed-form expression of the semantic rate, a data driven method is first applied to attain an approximated expression via data fitting. With the resulting complex transcendental function, majorization minimization (MM) is adopted to convert the original formulated problem into a multiple-ratio problem, which allows fractional programming (FP) to be used to further transform the problem into an inhomogeneous quadratically constrained quadratic programs (QCQP) problem. Solving the problem leads to a semi-closed form solution with undetermined Lagrangian factors that can be updated by a fixed point algorithm. Extensive simulation results demonstrate that the proposed beamforming scheme significantly outperforms conventional beamforming algorithms such as zero-forcing (ZF), maximum ratio transmission (MRT), and weighted minimum mean-square error (WMMSE). <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.11693v3-abstract-full').style.display = 'none'; document.getElementById('2403.11693v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Submitted to IEEE for possible publication</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.10493">arXiv:2403.10493</a> <span> [<a href="https://arxiv.org/pdf/2403.10493">pdf</a>, <a href="https://arxiv.org/format/2403.10493">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> MusicHiFi: Fast High-Fidelity Stereo Vocoding </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhu%2C+G">Ge Zhu</a>, <a href="/search/eess?searchtype=author&query=Caceres%2C+J">Juan-Pablo Caceres</a>, <a href="/search/eess?searchtype=author&query=Duan%2C+Z">Zhiyao Duan</a>, <a href="/search/eess?searchtype=author&query=Bryan%2C+N+J">Nicholas J. Bryan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.10493v4-abstract-short" style="display: inline;"> Diffusion-based audio and music generation models commonly perform generation by constructing an image representation of audio (e.g., a mel-spectrogram) and then convert it to audio using a phase reconstruction model or vocoder. Typical vocoders, however, produce monophonic audio at lower resolutions (e.g., 16-24 kHz), which limits their usefulness. We propose MusicHiFi -- an efficient high-fideli… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.10493v4-abstract-full').style.display = 'inline'; document.getElementById('2403.10493v4-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.10493v4-abstract-full" style="display: none;"> Diffusion-based audio and music generation models commonly perform generation by constructing an image representation of audio (e.g., a mel-spectrogram) and then convert it to audio using a phase reconstruction model or vocoder. Typical vocoders, however, produce monophonic audio at lower resolutions (e.g., 16-24 kHz), which limits their usefulness. We propose MusicHiFi -- an efficient high-fidelity stereophonic vocoder. Our method employs a cascade of three generative adversarial networks (GANs) that convert low-resolution mel-spectrograms to audio, upsamples to high-resolution audio via bandwidth extension, and upmixes to stereophonic audio. Compared to past work, we propose 1) a unified GAN-based generator and discriminator architecture and training procedure for each stage of our cascade, 2) a new fast, near downsampling-compatible bandwidth extension module, and 3) a new fast downmix-compatible mono-to-stereo upmixer that ensures the preservation of monophonic content in the output. We evaluate our approach using objective and subjective listening tests and find our approach yields comparable or better audio quality, better spatialization control, and significantly faster inference speed compared to past work. Sound examples are at \url{https://MusicHiFi.github.io/web/}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.10493v4-abstract-full').style.display = 'none'; document.getElementById('2403.10493v4-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 15 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to IEEE Signal Processing Letters</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.01956">arXiv:2403.01956</a> <span> [<a href="https://arxiv.org/pdf/2403.01956">pdf</a>, <a href="https://arxiv.org/ps/2403.01956">ps</a>, <a href="https://arxiv.org/format/2403.01956">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Hybrid Active-Passive RIS Transmitter Enabled Energy-Efficient Multi-User Communications </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Huang%2C+A">Ao Huang</a>, <a href="/search/eess?searchtype=author&query=Mu%2C+X">Xidong Mu</a>, <a href="/search/eess?searchtype=author&query=Guo%2C+L">Li Guo</a>, <a href="/search/eess?searchtype=author&query=Zhu%2C+G">Guangyu Zhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.01956v1-abstract-short" style="display: inline;"> A novel hybrid active-passive reconfigurable intelligent surface (RIS) transmitter enabled downlink multi-user communication system is investigated. Specifically, RISs are exploited to serve as transmitter antennas, where each element can flexibly switch between active and passive modes to deliver information to multiple users. The system energy efficiency (EE) maximization problem is formulated b… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.01956v1-abstract-full').style.display = 'inline'; document.getElementById('2403.01956v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.01956v1-abstract-full" style="display: none;"> A novel hybrid active-passive reconfigurable intelligent surface (RIS) transmitter enabled downlink multi-user communication system is investigated. Specifically, RISs are exploited to serve as transmitter antennas, where each element can flexibly switch between active and passive modes to deliver information to multiple users. The system energy efficiency (EE) maximization problem is formulated by jointly optimizing the RIS element scheduling and beamforming coefficients, as well as the power allocation coefficients, subject to the user's individual rate requirement and the maximum RIS amplification power constraint. Using the Dinkelbach relaxation, the original mixed-integer nonlinear programming problem is transformed into a nonfractional optimization problem with a two-layer structure, which is solved by the alternating optimization approach. In particular, an exhaustive search method is proposed to determine the optimal operating mode for each RIS element. Then, the RIS beamforming and power allocation coefficients are properly designed in an alternating manner. To overcome the potentially high complexity caused by exhaustive searching, we further develop a joint RIS element mode and beamforming optimization scheme by exploiting the Big-M formulation technique. Numerical results validate that: 1) The proposed hybrid RIS scheme yields higher EE than the baseline multi-antenna schemes employing fully active/passive RIS or conventional radio frequency chains; 2) Both proposed algorithms are effective in improving the system performance, especially the latter can achieve precise design of RIS elements with low complexity; and 3) For a fixed-size hybrid RIS, maximum EE can be reaped by setting only a minority of elements to operate in the active mode. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.01956v1-abstract-full').style.display = 'none'; document.getElementById('2403.01956v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2402.17780">arXiv:2402.17780</a> <span> [<a href="https://arxiv.org/pdf/2402.17780">pdf</a>, <a href="https://arxiv.org/format/2402.17780">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Medical Physics">physics.med-ph</span> </div> </div> <p class="title is-5 mathjax"> Constraint Latent Space Matters: An Anti-anomalous Waveform Transformation Solution from Photoplethysmography to Arterial Blood Pressure </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Bian%2C+C">Cheng Bian</a>, <a href="/search/eess?searchtype=author&query=Li%2C+X">Xiaoyu Li</a>, <a href="/search/eess?searchtype=author&query=Bi%2C+Q">Qi Bi</a>, <a href="/search/eess?searchtype=author&query=Zhu%2C+G">Guangpu Zhu</a>, <a href="/search/eess?searchtype=author&query=Lyu%2C+J">Jiegeng Lyu</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+W">Weile Zhang</a>, <a href="/search/eess?searchtype=author&query=Li%2C+Y">Yelei Li</a>, <a href="/search/eess?searchtype=author&query=Zeng%2C+Z">Zijing Zeng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2402.17780v1-abstract-short" style="display: inline;"> Arterial blood pressure (ABP) holds substantial promise for proactive cardiovascular health management. Notwithstanding its potential, the invasive nature of ABP measurements confines their utility primarily to clinical environments, limiting their applicability for continuous monitoring beyond medical facilities. The conversion of photoplethysmography (PPG) signals into ABP equivalents has garner… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.17780v1-abstract-full').style.display = 'inline'; document.getElementById('2402.17780v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2402.17780v1-abstract-full" style="display: none;"> Arterial blood pressure (ABP) holds substantial promise for proactive cardiovascular health management. Notwithstanding its potential, the invasive nature of ABP measurements confines their utility primarily to clinical environments, limiting their applicability for continuous monitoring beyond medical facilities. The conversion of photoplethysmography (PPG) signals into ABP equivalents has garnered significant attention due to its potential in revolutionizing cardiovascular disease management. Recent strides in PPG-to-ABP prediction encompass the integration of generative and discriminative models. Despite these advances, the efficacy of these models is curtailed by the latent space shift predicament, stemming from alterations in PPG data distribution across disparate hardware and individuals, potentially leading to distorted ABP waveforms. To tackle this problem, we present an innovative solution named the Latent Space Constraint Transformer (LSCT), leveraging a quantized codebook to yield robust latent spaces by employing multiple discretizing bases. To facilitate improved reconstruction, the Correlation-boosted Attention Module (CAM) is introduced to systematically query pertinent bases on a global scale. Furthermore, to enhance expressive capacity, we propose the Multi-Spectrum Enhancement Knowledge (MSEK), which fosters local information flow within the channels of latent code and provides additional embedding for reconstruction. Through comprehensive experimentation on both publicly available datasets and a private downstream task dataset, the proposed approach demonstrates noteworthy performance enhancements compared to existing methods. Extensive ablation studies further substantiate the effectiveness of each introduced module. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.17780v1-abstract-full').style.display = 'none'; document.getElementById('2402.17780v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by AAAI-2024, main track</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2402.06986">arXiv:2402.06986</a> <span> [<a href="https://arxiv.org/pdf/2402.06986">pdf</a>, <a href="https://arxiv.org/format/2402.06986">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Cacophony: An Improved Contrastive Audio-Text Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhu%2C+G">Ge Zhu</a>, <a href="/search/eess?searchtype=author&query=Darefsky%2C+J">Jordan Darefsky</a>, <a href="/search/eess?searchtype=author&query=Duan%2C+Z">Zhiyao Duan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2402.06986v3-abstract-short" style="display: inline;"> Despite recent advancements, audio-text models still lag behind their image-text counterparts in scale and performance. In this paper, we propose to improve both the data scale and the training procedure of audio-text contrastive models. Specifically, we craft a large-scale audio-text dataset containing 13,000 hours of text-labeled audio, using pretrained language models to process noisy text desc… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.06986v3-abstract-full').style.display = 'inline'; document.getElementById('2402.06986v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2402.06986v3-abstract-full" style="display: none;"> Despite recent advancements, audio-text models still lag behind their image-text counterparts in scale and performance. In this paper, we propose to improve both the data scale and the training procedure of audio-text contrastive models. Specifically, we craft a large-scale audio-text dataset containing 13,000 hours of text-labeled audio, using pretrained language models to process noisy text descriptions and automatic captioning to obtain text descriptions for unlabeled audio samples. We first train on audio-only data with a masked autoencoder (MAE) objective, which allows us to benefit from the scalability of unlabeled audio datasets. We then train a contrastive model with an auxiliary captioning objective with the audio encoder initialized from the MAE model. Our final model, which we name Cacophony, achieves state-of-the-art performance on audio-text retrieval tasks, and exhibits competitive results on the HEAR benchmark and other downstream tasks such as zero-shot classification. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.06986v3-abstract-full').style.display = 'none'; document.getElementById('2402.06986v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 10 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at IEEE/ACM Transactions on Audio, Speech, and Language Processing</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2402.02729">arXiv:2402.02729</a> <span> [<a href="https://arxiv.org/pdf/2402.02729">pdf</a>, <a href="https://arxiv.org/ps/2402.02729">ps</a>, <a href="https://arxiv.org/format/2402.02729">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Fast and Accurate Cooperative Radio Map Estimation Enabled by GAN </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhang%2C+Z">Zezhong Zhang</a>, <a href="/search/eess?searchtype=author&query=Zhu%2C+G">Guangxu Zhu</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+J">Junting Chen</a>, <a href="/search/eess?searchtype=author&query=Cui%2C+S">Shuguang Cui</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2402.02729v1-abstract-short" style="display: inline;"> In the 6G era, real-time radio resource monitoring and management are urged to support diverse wireless-empowered applications. This calls for fast and accurate estimation on the distribution of the radio resources, which is usually represented by the spatial signal power strength over the geographical environment, known as a radio map. In this paper, we present a cooperative radio map estimation… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.02729v1-abstract-full').style.display = 'inline'; document.getElementById('2402.02729v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2402.02729v1-abstract-full" style="display: none;"> In the 6G era, real-time radio resource monitoring and management are urged to support diverse wireless-empowered applications. This calls for fast and accurate estimation on the distribution of the radio resources, which is usually represented by the spatial signal power strength over the geographical environment, known as a radio map. In this paper, we present a cooperative radio map estimation (CRME) approach enabled by the generative adversarial network (GAN), called as GAN-CRME, which features fast and accurate radio map estimation without the transmitters' information. The radio map is inferred by exploiting the interaction between distributed received signal strength (RSS) measurements at mobile users and the geographical map using a deep neural network estimator, resulting in low data-acquisition cost and computational complexity. Moreover, a GAN-based learning algorithm is proposed to boost the inference capability of the deep neural network estimator by exploiting the power of generative AI. Simulation results showcase that the proposed GAN-CRME is even capable of coarse error-correction when the geographical map information is inaccurate. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.02729v1-abstract-full').style.display = 'none'; document.getElementById('2402.02729v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2311.16192">arXiv:2311.16192</a> <span> [<a href="https://arxiv.org/pdf/2311.16192">pdf</a>, <a href="https://arxiv.org/format/2311.16192">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Utilizing Multiple Inputs Autoregressive Models for Bearing Remaining Useful Life Prediction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Wang%2C+J">Junliang Wang</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+Q">Qinghua Zhang</a>, <a href="/search/eess?searchtype=author&query=Zhu%2C+G">Guanhua Zhu</a>, <a href="/search/eess?searchtype=author&query=Sun%2C+G">Guoxi Sun</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2311.16192v1-abstract-short" style="display: inline;"> Accurate prediction of the Remaining Useful Life (RUL) of rolling bearings is crucial in industrial production, yet existing models often struggle with limited generalization capabilities due to their inability to fully process all vibration signal patterns. We introduce a novel multi-input autoregressive model to address this challenge in RUL prediction for bearings. Our approach uniquely integra… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.16192v1-abstract-full').style.display = 'inline'; document.getElementById('2311.16192v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2311.16192v1-abstract-full" style="display: none;"> Accurate prediction of the Remaining Useful Life (RUL) of rolling bearings is crucial in industrial production, yet existing models often struggle with limited generalization capabilities due to their inability to fully process all vibration signal patterns. We introduce a novel multi-input autoregressive model to address this challenge in RUL prediction for bearings. Our approach uniquely integrates vibration signals with previously predicted Health Indicator (HI) values, employing feature fusion to output current window HI values. Through autoregressive iterations, the model attains a global receptive field, effectively overcoming the limitations in generalization. Furthermore, we innovatively incorporate a segmentation method and multiple training iterations to mitigate error accumulation in autoregressive models. Empirical evaluation on the PMH2012 dataset demonstrates that our model, compared to other backbone networks using similar autoregressive approaches, achieves significantly lower Root Mean Square Error (RMSE) and Score. Notably, it outperforms traditional autoregressive models that use label values as inputs and non-autoregressive networks, showing superior generalization abilities with a marked lead in RMSE and Score metrics. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.16192v1-abstract-full').style.display = 'none'; document.getElementById('2311.16192v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2311.10525">arXiv:2311.10525</a> <span> [<a href="https://arxiv.org/pdf/2311.10525">pdf</a>, <a href="https://arxiv.org/format/2311.10525">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Utilizing VQ-VAE for End-to-End Health Indicator Generation in Predicting Rolling Bearing RUL </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Wang%2C+J">Junliang Wang</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+Q">Qinghua Zhang</a>, <a href="/search/eess?searchtype=author&query=Zhu%2C+G">Guanhua Zhu</a>, <a href="/search/eess?searchtype=author&query=Sun%2C+G">Guoxi Sun</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2311.10525v1-abstract-short" style="display: inline;"> The prediction of the remaining useful life (RUL) of rolling bearings is a pivotal issue in industrial production. A crucial approach to tackling this issue involves transforming vibration signals into health indicators (HI) to aid model training. This paper presents an end-to-end HI construction method, vector quantised variational autoencoder (VQ-VAE), which addresses the need for dimensionality… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.10525v1-abstract-full').style.display = 'inline'; document.getElementById('2311.10525v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2311.10525v1-abstract-full" style="display: none;"> The prediction of the remaining useful life (RUL) of rolling bearings is a pivotal issue in industrial production. A crucial approach to tackling this issue involves transforming vibration signals into health indicators (HI) to aid model training. This paper presents an end-to-end HI construction method, vector quantised variational autoencoder (VQ-VAE), which addresses the need for dimensionality reduction of latent variables in traditional unsupervised learning methods such as autoencoder. Moreover, concerning the inadequacy of traditional statistical metrics in reflecting curve fluctuations accurately, two novel statistical metrics, mean absolute distance (MAD) and mean variance (MV), are introduced. These metrics accurately depict the fluctuation patterns in the curves, thereby indicating the model's accuracy in discerning similar features. On the PMH2012 dataset, methods employing VQ-VAE for label construction achieved lower values for MAD and MV. Furthermore, the ASTCN prediction model trained with VQ-VAE labels demonstrated commendable performance, attaining the lowest values for MAD and MV. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.10525v1-abstract-full').style.display = 'none'; document.getElementById('2311.10525v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">17 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2311.09028">arXiv:2311.09028</a> <span> [<a href="https://arxiv.org/pdf/2311.09028">pdf</a>, <a href="https://arxiv.org/format/2311.09028">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Integrating Sensing, Communication, and Power Transfer: Multiuser Beamforming Design </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhou%2C+Z">Ziqin Zhou</a>, <a href="/search/eess?searchtype=author&query=Li%2C+X">Xiaoyang Li</a>, <a href="/search/eess?searchtype=author&query=Zhu%2C+G">Guangxu Zhu</a>, <a href="/search/eess?searchtype=author&query=Xu%2C+J">Jie Xu</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+K">Kaibin Huang</a>, <a href="/search/eess?searchtype=author&query=Cui%2C+S">Shuguang Cui</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2311.09028v1-abstract-short" style="display: inline;"> In the sixth-generation (6G) networks, massive low-power devices are expected to sense environment and deliver tremendous data. To enhance the radio resource efficiency, the integrated sensing and communication (ISAC) technique exploits the sensing and communication functionalities of signals, while the simultaneous wireless information and power transfer (SWIPT) techniques utilizes the same signa… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.09028v1-abstract-full').style.display = 'inline'; document.getElementById('2311.09028v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2311.09028v1-abstract-full" style="display: none;"> In the sixth-generation (6G) networks, massive low-power devices are expected to sense environment and deliver tremendous data. To enhance the radio resource efficiency, the integrated sensing and communication (ISAC) technique exploits the sensing and communication functionalities of signals, while the simultaneous wireless information and power transfer (SWIPT) techniques utilizes the same signals as the carriers for both information and power delivery. The further combination of ISAC and SWIPT leads to the advanced technology namely integrated sensing, communication, and power transfer (ISCPT). In this paper, a multi-user multiple-input multiple-output (MIMO) ISCPT system is considered, where a base station equipped with multiple antennas transmits messages to multiple information receivers (IRs), transfers power to multiple energy receivers (ERs), and senses a target simultaneously. The sensing target can be regarded as a point or an extended surface. When the locations of IRs and ERs are separated, the MIMO beamforming designs are optimized to improve the sensing performance while meeting the communication and power transfer requirements. The resultant non-convex optimization problems are solved based on a series of techniques including Schur complement transformation and rank reduction. Moreover, when the IRs and ERs are co-located, the power splitting factors are jointly optimized together with the beamformers to balance the performance of communication and power transfer. To better understand the performance of ISCPT, the target positioning problem is further investigated. Simulations are conducted to verify the effectiveness of our proposed designs, which also reveal a performance tradeoff among sensing, communication, and power transfer. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.09028v1-abstract-full').style.display = 'none'; document.getElementById('2311.09028v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This paper has been submitted to IEEE for possible publication</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2311.08667">arXiv:2311.08667</a> <span> [<a href="https://arxiv.org/pdf/2311.08667">pdf</a>, <a href="https://arxiv.org/format/2311.08667">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> EDMSound: Spectrogram Based Diffusion Models for Efficient and High-Quality Audio Synthesis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhu%2C+G">Ge Zhu</a>, <a href="/search/eess?searchtype=author&query=Wen%2C+Y">Yutong Wen</a>, <a href="/search/eess?searchtype=author&query=Carbonneau%2C+M">Marc-Andr茅 Carbonneau</a>, <a href="/search/eess?searchtype=author&query=Duan%2C+Z">Zhiyao Duan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2311.08667v2-abstract-short" style="display: inline;"> Audio diffusion models can synthesize a wide variety of sounds. Existing models often operate on the latent domain with cascaded phase recovery modules to reconstruct waveform. This poses challenges when generating high-fidelity audio. In this paper, we propose EDMSound, a diffusion-based generative model in spectrogram domain under the framework of elucidated diffusion models (EDM). Combining wit… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.08667v2-abstract-full').style.display = 'inline'; document.getElementById('2311.08667v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2311.08667v2-abstract-full" style="display: none;"> Audio diffusion models can synthesize a wide variety of sounds. Existing models often operate on the latent domain with cascaded phase recovery modules to reconstruct waveform. This poses challenges when generating high-fidelity audio. In this paper, we propose EDMSound, a diffusion-based generative model in spectrogram domain under the framework of elucidated diffusion models (EDM). Combining with efficient deterministic sampler, we achieved similar Fr茅chet audio distance (FAD) score as top-ranked baseline with only 10 steps and reached state-of-the-art performance with 50 steps on the DCASE2023 foley sound generation benchmark. We also revealed a potential concern regarding diffusion based audio generation models that they tend to generate samples with high perceptual similarity to the data from training data. Project page: https://agentcooper2002.github.io/EDMSound/ <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.08667v2-abstract-full').style.display = 'none'; document.getElementById('2311.08667v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 14 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at NeurIPS Workshop: Machine Learning for Audio (Camera Ready)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2310.07906">arXiv:2310.07906</a> <span> [<a href="https://arxiv.org/pdf/2310.07906">pdf</a>, <a href="https://arxiv.org/ps/2310.07906">ps</a>, <a href="https://arxiv.org/format/2310.07906">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Analysis of PDEs">math.AP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Optimization and Control">math.OC</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/ACCESS.2024.3392511">10.1109/ACCESS.2024.3392511 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Power Tracking Control of Heterogeneous Populations of TCLs with Partially Measured States </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhang%2C+Z">Zhenhe Zhang</a>, <a href="/search/eess?searchtype=author&query=Zheng%2C+J">Jun Zheng</a>, <a href="/search/eess?searchtype=author&query=Zhu%2C+G">Guchuan Zhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2310.07906v1-abstract-short" style="display: inline;"> This paper presents a new aggregate power tracking control scheme for populations of thermostatically controlled loads (TCLs). The control design is performed in the framework of partial differential equations (PDEs) based on a late-lumping procedure without truncating the infinite-dimensional model describing the dynamics of the TCL population. An input-output linearization control scheme, which… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.07906v1-abstract-full').style.display = 'inline'; document.getElementById('2310.07906v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2310.07906v1-abstract-full" style="display: none;"> This paper presents a new aggregate power tracking control scheme for populations of thermostatically controlled loads (TCLs). The control design is performed in the framework of partial differential equations (PDEs) based on a late-lumping procedure without truncating the infinite-dimensional model describing the dynamics of the TCL population. An input-output linearization control scheme, which is independent of system parameters and uses only partial state measurement, is derived, and a sliding model-like control is applied to achieve finite-time input-to-state stability for tracking error dynamics. Such a control strategy can ensure robust performance in the presence of modeling uncertainties, while considerably reducing the communication burden in large scale distributed systems similar to that considered in the present work. A rigorous analysis of the closed-loop stability of the underlying PDE system was conducted, which guaranteed the validity of the developed control scheme. Simulation studies were performed while considering two TCL populations with a significant difference in their size, and the results show that the developed control scheme performs well in both cases, thereby confirming the effectiveness of the proposed solution. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.07906v1-abstract-full').style.display = 'none'; document.getElementById('2310.07906v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">arXiv admin note: substantial text overlap with arXiv:2305.09118</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> IEEE Access, 2024 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2307.16117">arXiv:2307.16117</a> <span> [<a href="https://arxiv.org/pdf/2307.16117">pdf</a>, <a href="https://arxiv.org/format/2307.16117">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/GCWkshps58843.2023.10464790">10.1109/GCWkshps58843.2023.10464790 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Successive Pose Estimation and Beam Tracking for mmWave Vehicular Communication Systems </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Liu%2C+C">Cen Liu</a>, <a href="/search/eess?searchtype=author&query=Zhu%2C+G">Guangxu Zhu</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+F">Fan Liu</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+Y">Yuanwei Liu</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+K">Kaibin Huang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2307.16117v2-abstract-short" style="display: inline;"> The millimeter wave (mmWave) radar sensing-aided communications in vehicular mobile communication systems is investigated. To alleviate the beam training overhead under high mobility scenarios, a successive pose estimation and beam tracking (SPEBT) scheme is proposed to facilitate mmWave communications with the assistance of mmWave radar sensing. The proposed SPEBT scheme first resorts to a Fast C… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2307.16117v2-abstract-full').style.display = 'inline'; document.getElementById('2307.16117v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2307.16117v2-abstract-full" style="display: none;"> The millimeter wave (mmWave) radar sensing-aided communications in vehicular mobile communication systems is investigated. To alleviate the beam training overhead under high mobility scenarios, a successive pose estimation and beam tracking (SPEBT) scheme is proposed to facilitate mmWave communications with the assistance of mmWave radar sensing. The proposed SPEBT scheme first resorts to a Fast Conservative Filtering for Efficient and Accurate Radar odometry (Fast-CFEAR) approach to estimate the vehicle pose consisting of 2-dimensional position and yaw from radar point clouds collected by mmWave radar sensor. Then, the pose estimation information is fed into an extend Kalman filter to perform beam tracking for the line-of-sight channel. Owing to the intrinsic robustness of mmWave radar sensing, the proposed SPEBT scheme is capable of operating reliably under extreme weather/illumination conditions and large-scale global navigation satellite systems (GNSS)-denied environments. The practical deployment of the SPEBT scheme is verified through rigorous testing on a real-world sensing dataset. Simulation results demonstrate that the proposed SPEBT scheme is capable of providing precise pose estimation information and accurate beam tracking output, while reducing the proportion of beam training overhead to less than 5% averagely. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2307.16117v2-abstract-full').style.display = 'none'; document.getElementById('2307.16117v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 August, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 29 July, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">An extended version of a conference submission. 7 pages, 5 figures</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> IEEE Global Communications Conference Workshops (GC Wkshps) 2023 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2306.12298">arXiv:2306.12298</a> <span> [<a href="https://arxiv.org/pdf/2306.12298">pdf</a>, <a href="https://arxiv.org/format/2306.12298">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> StarVQA+: Co-training Space-Time Attention for Video Quality Assessment </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Xing%2C+F">Fengchuang Xing</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+Y">Yuan-Gen Wang</a>, <a href="/search/eess?searchtype=author&query=Tang%2C+W">Weixuan Tang</a>, <a href="/search/eess?searchtype=author&query=Zhu%2C+G">Guopu Zhu</a>, <a href="/search/eess?searchtype=author&query=Kwong%2C+S">Sam Kwong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2306.12298v1-abstract-short" style="display: inline;"> Self-attention based Transformer has achieved great success in many computer vision tasks. However, its application to video quality assessment (VQA) has not been satisfactory so far. Evaluating the quality of in-the-wild videos is challenging due to the unknown of pristine reference and shooting distortion. This paper presents a co-trained Space-Time Attention network for the VQA problem, termed… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.12298v1-abstract-full').style.display = 'inline'; document.getElementById('2306.12298v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2306.12298v1-abstract-full" style="display: none;"> Self-attention based Transformer has achieved great success in many computer vision tasks. However, its application to video quality assessment (VQA) has not been satisfactory so far. Evaluating the quality of in-the-wild videos is challenging due to the unknown of pristine reference and shooting distortion. This paper presents a co-trained Space-Time Attention network for the VQA problem, termed StarVQA+. Specifically, we first build StarVQA+ by alternately concatenating the divided space-time attention. Then, to facilitate the training of StarVQA+, we design a vectorized regression loss by encoding the mean opinion score (MOS) to the probability vector and embedding a special token as the learnable variable of MOS, leading to better fitting of human's rating process. Finally, to solve the data hungry problem with Transformer, we propose to co-train the spatial and temporal attention weights using both images and videos. Various experiments are conducted on the de-facto in-the-wild video datasets, including LIVE-Qualcomm, LIVE-VQC, KoNViD-1k, YouTube-UGC, LSVQ, LSVQ-1080p, and DVL2021. Experimental results demonstrate the superiority of the proposed StarVQA+ over the state-of-the-art. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.12298v1-abstract-full').style.display = 'none'; document.getElementById('2306.12298v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2306.06603">arXiv:2306.06603</a> <span> [<a href="https://arxiv.org/pdf/2306.06603">pdf</a>, <a href="https://arxiv.org/ps/2306.06603">ps</a>, <a href="https://arxiv.org/format/2306.06603">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Task-Oriented Integrated Sensing, Computation and Communication for Wireless Edge AI </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Xing%2C+H">Hong Xing</a>, <a href="/search/eess?searchtype=author&query=Zhu%2C+G">Guangxu Zhu</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+D">Dongzhu Liu</a>, <a href="/search/eess?searchtype=author&query=Wen%2C+H">Haifeng Wen</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+K">Kaibin Huang</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+K">Kaishun Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2306.06603v1-abstract-short" style="display: inline;"> With the advent of emerging IoT applications such as autonomous driving, digital-twin and metaverse etc. featuring massive data sensing, analyzing and inference as well critical latency in beyond 5G (B5G) networks, edge artificial intelligence (AI) has been proposed to provide high-performance computation of a conventional cloud down to the network edge. Recently, convergence of wireless sensing,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.06603v1-abstract-full').style.display = 'inline'; document.getElementById('2306.06603v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2306.06603v1-abstract-full" style="display: none;"> With the advent of emerging IoT applications such as autonomous driving, digital-twin and metaverse etc. featuring massive data sensing, analyzing and inference as well critical latency in beyond 5G (B5G) networks, edge artificial intelligence (AI) has been proposed to provide high-performance computation of a conventional cloud down to the network edge. Recently, convergence of wireless sensing, computation and communication (SC${}^2$) for specific edge AI tasks, has aroused paradigm shift by enabling (partial) sharing of the radio-frequency (RF) transceivers and information processing pipelines among these three fundamental functionalities of IoT. However, most existing design frameworks separate these designs incurring unnecessary signaling overhead and waste of energy, and it is therefore of paramount importance to advance fully integrated sensing, computation and communication (ISCC) to achieve ultra-reliable and low-latency edge intelligence acquisition. In this article, we provide an overview of principles of enabling ISCC technologies followed by two concrete use cases of edge AI tasks demonstrating the advantage of task-oriented ISCC, and pointed out some practical challenges in edge AI design with advanced ISCC solutions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.06603v1-abstract-full').style.display = 'none'; document.getElementById('2306.06603v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">18 pages, 6 figures, submitted for possible journal publication</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2306.02990">arXiv:2306.02990</a> <span> [<a href="https://arxiv.org/pdf/2306.02990">pdf</a>, <a href="https://arxiv.org/format/2306.02990">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Integrated Sensing, Computation, and Communication for UAV-assisted Federated Edge Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Tang%2C+Y">Yao Tang</a>, <a href="/search/eess?searchtype=author&query=Zhu%2C+G">Guangxu Zhu</a>, <a href="/search/eess?searchtype=author&query=Xu%2C+W">Wei Xu</a>, <a href="/search/eess?searchtype=author&query=Cheung%2C+M+H">Man Hon Cheung</a>, <a href="/search/eess?searchtype=author&query=Lok%2C+T">Tat-Ming Lok</a>, <a href="/search/eess?searchtype=author&query=Cui%2C+S">Shuguang Cui</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2306.02990v1-abstract-short" style="display: inline;"> Federated edge learning (FEEL) enables privacy-preserving model training through periodic communication between edge devices and the server. Unmanned Aerial Vehicle (UAV)-mounted edge devices are particularly advantageous for FEEL due to their flexibility and mobility in efficient data collection. In UAV-assisted FEEL, sensing, computation, and communication are coupled and compete for limited onb… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.02990v1-abstract-full').style.display = 'inline'; document.getElementById('2306.02990v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2306.02990v1-abstract-full" style="display: none;"> Federated edge learning (FEEL) enables privacy-preserving model training through periodic communication between edge devices and the server. Unmanned Aerial Vehicle (UAV)-mounted edge devices are particularly advantageous for FEEL due to their flexibility and mobility in efficient data collection. In UAV-assisted FEEL, sensing, computation, and communication are coupled and compete for limited onboard resources, and UAV deployment also affects sensing and communication performance. Therefore, the joint design of UAV deployment and resource allocation is crucial to achieving the optimal training performance. In this paper, we address the problem of joint UAV deployment design and resource allocation for FEEL via a concrete case study of human motion recognition based on wireless sensing. We first analyze the impact of UAV deployment on the sensing quality and identify a threshold value for the sensing elevation angle that guarantees a satisfactory quality of data samples. Due to the non-ideal sensing channels, we consider the probabilistic sensing model, where the successful sensing probability of each UAV is determined by its position. Then, we derive the upper bound of the FEEL training loss as a function of the sensing probability. Theoretical results suggest that the convergence rate can be improved if UAVs have a uniform successful sensing probability. Based on this analysis, we formulate a training time minimization problem by jointly optimizing UAV deployment, integrated sensing, computation, and communication (ISCC) resources under a desirable optimality gap constraint. To solve this challenging mixed-integer non-convex problem, we apply the alternating optimization technique, and propose the bandwidth, batch size, and position optimization (BBPO) scheme to optimize these three decision variables alternately. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.02990v1-abstract-full').style.display = 'none'; document.getElementById('2306.02990v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2305.04152">arXiv:2305.04152</a> <span> [<a href="https://arxiv.org/pdf/2305.04152">pdf</a>, <a href="https://arxiv.org/ps/2305.04152">ps</a>, <a href="https://arxiv.org/format/2305.04152">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Bayesian Over-the-Air FedAvg via Channel Driven Stochastic Gradient Langevin Dynamics </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhang%2C+B">Boning Zhang</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+D">Dongzhu Liu</a>, <a href="/search/eess?searchtype=author&query=Simeone%2C+O">Osvaldo Simeone</a>, <a href="/search/eess?searchtype=author&query=Zhu%2C+G">Guangxu Zhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2305.04152v2-abstract-short" style="display: inline;"> The recent development of scalable Bayesian inference methods has renewed interest in the adoption of Bayesian learning as an alternative to conventional frequentist learning that offers improved model calibration via uncertainty quantification. Recently, federated averaging Langevin dynamics (FALD) was introduced as a variant of federated averaging that can efficiently implement distributed Bayes… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.04152v2-abstract-full').style.display = 'inline'; document.getElementById('2305.04152v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2305.04152v2-abstract-full" style="display: none;"> The recent development of scalable Bayesian inference methods has renewed interest in the adoption of Bayesian learning as an alternative to conventional frequentist learning that offers improved model calibration via uncertainty quantification. Recently, federated averaging Langevin dynamics (FALD) was introduced as a variant of federated averaging that can efficiently implement distributed Bayesian learning in the presence of noiseless communications. In this paper, we propose wireless FALD (WFALD), a novel protocol that realizes FALD in wireless systems by integrating over-the-air computation and channel-driven sampling for Monte Carlo updates. Unlike prior work on wireless Bayesian learning, WFALD enables (\emph{i}) multiple local updates between communication rounds; and (\emph{ii}) stochastic gradients computed by mini-batch. A convergence analysis is presented in terms of the 2-Wasserstein distance between the samples produced by WFALD and the targeted global posterior distribution. Analysis and experiments show that, when the signal-to-noise ratio is sufficiently large, channel noise can be fully repurposed for Monte Carlo sampling, thus entailing no loss in performance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.04152v2-abstract-full').style.display = 'none'; document.getElementById('2305.04152v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 6 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">6 pages, 4 figures, 26 references, submitted</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2303.16404">arXiv:2303.16404</a> <span> [<a href="https://arxiv.org/pdf/2303.16404">pdf</a>, <a href="https://arxiv.org/format/2303.16404">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Robust Andrew's sine estimate adaptive filtering </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Lu%2C+L">Lu Lu</a>, <a href="/search/eess?searchtype=author&query=Yu%2C+Y">Yi Yu</a>, <a href="/search/eess?searchtype=author&query=Zheng%2C+Z">Zongsheng Zheng</a>, <a href="/search/eess?searchtype=author&query=Zhu%2C+G">Guangya Zhu</a>, <a href="/search/eess?searchtype=author&query=Yang%2C+X">Xiaomin Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2303.16404v1-abstract-short" style="display: inline;"> The Andrew's sine function is a robust estimator, which has been used in outlier rejection and robust statistics. However, the performance of such estimator does not receive attention in the field of adaptive filtering techniques. Two Andrew's sine estimator (ASE)-based robust adaptive filtering algorithms are proposed in this brief. Specifically, to achieve improved performance and reduced comput… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.16404v1-abstract-full').style.display = 'inline'; document.getElementById('2303.16404v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2303.16404v1-abstract-full" style="display: none;"> The Andrew's sine function is a robust estimator, which has been used in outlier rejection and robust statistics. However, the performance of such estimator does not receive attention in the field of adaptive filtering techniques. Two Andrew's sine estimator (ASE)-based robust adaptive filtering algorithms are proposed in this brief. Specifically, to achieve improved performance and reduced computational complexity, the iterative Wiener filter (IWF) is an attractive choice. A novel IWF based on ASE (IWF-ASE) is proposed for impulsive noises. To further reduce the computational complexity, the leading dichotomous coordinate descent (DCD) algorithm is combined with the ASE, developing DCD-ASE algorithm. Simulations on system identification demonstrate that the proposed algorithms can achieve smaller misalignment as compared to the conventional IWF, recursive maximum correntropy criterion (RMCC), and DCD-RMCC algorithms in impulsive noise. Furthermore, the proposed algorithms exhibit improved performance in partial discharge (PD) denoising. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.16404v1-abstract-full').style.display = 'none'; document.getElementById('2303.16404v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 March, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages, 5 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2303.11319">arXiv:2303.11319</a> <span> [<a href="https://arxiv.org/pdf/2303.11319">pdf</a>, <a href="https://arxiv.org/format/2303.11319">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Over-the-Air Federated Edge Learning with Error-Feedback One-Bit Quantization and Power Control </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Liu%2C+Y">Yuding Liu</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+D">Dongzhu Liu</a>, <a href="/search/eess?searchtype=author&query=Zhu%2C+G">Guangxu Zhu</a>, <a href="/search/eess?searchtype=author&query=Shi%2C+Q">Qingjiang Shi</a>, <a href="/search/eess?searchtype=author&query=Zhong%2C+C">Caijun Zhong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2303.11319v1-abstract-short" style="display: inline;"> Over-the-air federated edge learning (Air-FEEL) is a communication-efficient framework for distributed machine learning using training data distributed at edge devices. This framework enables all edge devices to transmit model updates simultaneously over the entire available bandwidth, allowing for over-the-air aggregation. A one-bit digital over-the-air aggregation (OBDA) scheme has been recently… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.11319v1-abstract-full').style.display = 'inline'; document.getElementById('2303.11319v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2303.11319v1-abstract-full" style="display: none;"> Over-the-air federated edge learning (Air-FEEL) is a communication-efficient framework for distributed machine learning using training data distributed at edge devices. This framework enables all edge devices to transmit model updates simultaneously over the entire available bandwidth, allowing for over-the-air aggregation. A one-bit digital over-the-air aggregation (OBDA) scheme has been recently proposed, featuring one-bit gradient quantization at edge devices and majority-voting based decoding at the edge server. However, the low-resolution one-bit gradient quantization slows down the model convergence and leads to performance degradation. On the other hand, the aggregation errors caused by fading channels in Air-FEEL is still remained to be solved. To address these issues, we propose the error-feedback one-bit broadband digital aggregation (EFOBDA) and an optimized power control policy. To this end, we first provide a theoretical analysis to evaluate the impact of error feedback on the convergence of FL with EFOBDA. The analytical results show that, by setting an appropriate feedback strength, EFOBDA is comparable to the Air-FEEL without quantization, thus enhancing the performance of OBDA. Then, we further introduce a power control policy by maximizing the convergence rate under instantaneous power constraints. The convergence analysis and optimized power control policy are verified by the experiments, which show that the proposed scheme achieves significantly faster convergence and higher test accuracy in image classification tasks compared with the one-bit quantization scheme without error feedback or optimized power control policy. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.11319v1-abstract-full').style.display = 'none'; document.getElementById('2303.11319v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 March, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2303.06475">arXiv:2303.06475</a> <span> [<a href="https://arxiv.org/pdf/2303.06475">pdf</a>, <a href="https://arxiv.org/format/2303.06475">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Transcription free filler word detection with Neural semi-CRFs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhu%2C+G">Ge Zhu</a>, <a href="/search/eess?searchtype=author&query=Yan%2C+Y">Yujia Yan</a>, <a href="/search/eess?searchtype=author&query=Caceres%2C+J">Juan-Pablo Caceres</a>, <a href="/search/eess?searchtype=author&query=Duan%2C+Z">Zhiyao Duan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2303.06475v1-abstract-short" style="display: inline;"> Non-linguistic filler words, such as "uh" or "um", are prevalent in spontaneous speech and serve as indicators for expressing hesitation or uncertainty. Previous works for detecting certain non-linguistic filler words are highly dependent on transcriptions from a well-established commercial automatic speech recognition (ASR) system. However, certain ASR systems are not universally accessible from… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.06475v1-abstract-full').style.display = 'inline'; document.getElementById('2303.06475v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2303.06475v1-abstract-full" style="display: none;"> Non-linguistic filler words, such as "uh" or "um", are prevalent in spontaneous speech and serve as indicators for expressing hesitation or uncertainty. Previous works for detecting certain non-linguistic filler words are highly dependent on transcriptions from a well-established commercial automatic speech recognition (ASR) system. However, certain ASR systems are not universally accessible from many aspects, e.g., budget, target languages, and computational power. In this work, we investigate filler word detection system that does not depend on ASR systems. We show that, by using the structured state space sequence model (S4) and neural semi-Markov conditional random fields (semi-CRFs), we achieve an absolute F1 improvement of 6.4% (segment level) and 3.1% (event level) on the PodcastFillers dataset. We also conduct a qualitative analysis on the detected results to analyze the limitations of our proposed system. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.06475v1-abstract-full').style.display = 'none'; document.getElementById('2303.06475v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 March, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ICASSP 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2212.00227">arXiv:2212.00227</a> <span> [<a href="https://arxiv.org/pdf/2212.00227">pdf</a>, <a href="https://arxiv.org/format/2212.00227">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Wireless Image Transmission with Semantic and Security Awareness </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhang%2C+M">Maojun Zhang</a>, <a href="/search/eess?searchtype=author&query=Li%2C+Y">Yang Li</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+Z">Zezhong Zhang</a>, <a href="/search/eess?searchtype=author&query=Zhu%2C+G">Guangxu Zhu</a>, <a href="/search/eess?searchtype=author&query=Zhong%2C+C">Caijun Zhong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2212.00227v1-abstract-short" style="display: inline;"> Semantic communication is an increasingly popular framework for wireless image transmission due to its high communication efficiency. With the aid of the joint-source-and-channel (JSC) encoder implemented by neural network, semantic communication directly maps original images into symbol sequences containing semantic information. Compared with the traditional separate source and channel coding des… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2212.00227v1-abstract-full').style.display = 'inline'; document.getElementById('2212.00227v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2212.00227v1-abstract-full" style="display: none;"> Semantic communication is an increasingly popular framework for wireless image transmission due to its high communication efficiency. With the aid of the joint-source-and-channel (JSC) encoder implemented by neural network, semantic communication directly maps original images into symbol sequences containing semantic information. Compared with the traditional separate source and channel coding design used in bitlevel communication systems, semantic communication systems are known to be more efficient and accurate especially in the low signal-to-the-noise ratio (SNR) regime. This thus prompts an critical while yet to be tackled issue of security in semantic communication: it makes the eavesdropper more easier to crack the semantic information as it can be decoded even in a quite noisy channel. In this letter, we develop a semantic communication framework that accounts for both semantic meaning decoding efficiency and its risk of privacy leakage. To achieve this, targeting wireless image transmission, we on the one hand propose an JSC autoencoder featuring residual for efficient semantic meaning extraction and transmission, and on the other hand, propose a data-driven scheme that balances the efficiency-privacy tradeoff. Extensive experimental results are provided to show the effectiveness and robustness of the proposed scheme. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2212.00227v1-abstract-full').style.display = 'none'; document.getElementById('2212.00227v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 November, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Submitted to IEEE WCL for possible publication</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2211.01255">arXiv:2211.01255</a> <span> [<a href="https://arxiv.org/pdf/2211.01255">pdf</a>, <a href="https://arxiv.org/format/2211.01255">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Task-Oriented Over-the-Air Computation for Multi-Device Edge AI </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Wen%2C+D">Dingzhu Wen</a>, <a href="/search/eess?searchtype=author&query=Jiao%2C+X">Xiang Jiao</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+P">Peixi Liu</a>, <a href="/search/eess?searchtype=author&query=Zhu%2C+G">Guangxu Zhu</a>, <a href="/search/eess?searchtype=author&query=Shi%2C+Y">Yuanming Shi</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+K">Kaibin Huang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2211.01255v1-abstract-short" style="display: inline;"> Departing from the classic paradigm of data-centric designs, the 6G networks for supporting edge AI features task-oriented techniques that focus on effective and efficient execution of AI task. Targeting end-to-end system performance, such techniques are sophisticated as they aim to seamlessly integrate sensing (data acquisition), communication (data transmission), and computation (data processing… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.01255v1-abstract-full').style.display = 'inline'; document.getElementById('2211.01255v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2211.01255v1-abstract-full" style="display: none;"> Departing from the classic paradigm of data-centric designs, the 6G networks for supporting edge AI features task-oriented techniques that focus on effective and efficient execution of AI task. Targeting end-to-end system performance, such techniques are sophisticated as they aim to seamlessly integrate sensing (data acquisition), communication (data transmission), and computation (data processing). Aligned with the paradigm shift, a task-oriented over-the-air computation (AirComp) scheme is proposed in this paper for multi-device split-inference system. In the considered system, local feature vectors, which are extracted from the real-time noisy sensory data on devices, are aggregated over-the-air by exploiting the waveform superposition in a multiuser channel. Then the aggregated features as received at a server are fed into an inference model with the result used for decision making or control of actuators. To design inference-oriented AirComp, the transmit precoders at edge devices and receive beamforming at edge server are jointly optimized to rein in the aggregation error and maximize the inference accuracy. The problem is made tractable by measuring the inference accuracy using a surrogate metric called discriminant gain, which measures the discernibility of two object classes in the application of object/event classification. It is discovered that the conventional AirComp beamforming design for minimizing the mean square error in generic AirComp with respect to the noiseless case may not lead to the optimal classification accuracy. The reason is due to the overlooking of the fact that feature dimensions have different sensitivity towards aggregation errors and are thus of different importance levels for classification. This issue is addressed in this work via a new task-oriented AirComp scheme designed by directly maximizing the derived discriminant gain. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.01255v1-abstract-full').style.display = 'none'; document.getElementById('2211.01255v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 November, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2022. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2210.09659">arXiv:2210.09659</a> <span> [<a href="https://arxiv.org/pdf/2210.09659">pdf</a>, <a href="https://arxiv.org/ps/2210.09659">ps</a>, <a href="https://arxiv.org/format/2210.09659">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> </div> </div> <p class="title is-5 mathjax"> Large-Scale Bandwidth and Power Optimization for Multi-Modal Edge Intelligence Autonomous Driving </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Li%2C+X">Xinrao Li</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+T">Tong Zhang</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+S">Shuai Wang</a>, <a href="/search/eess?searchtype=author&query=Zhu%2C+G">Guangxu Zhu</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+R">Rui Wang</a>, <a href="/search/eess?searchtype=author&query=Chang%2C+T">Tsung-Hui Chang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2210.09659v2-abstract-short" style="display: inline;"> Edge intelligence autonomous driving (EIAD) offers computing resources in autonomous vehicles for training deep neural networks. However, wireless channels between the edge server and the autonomous vehicles are time-varying due to the high-mobility of vehicles. Moreover, the required number of training samples for different data modalities, e.g., images, point-clouds, is diverse. Consequently, wh… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2210.09659v2-abstract-full').style.display = 'inline'; document.getElementById('2210.09659v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2210.09659v2-abstract-full" style="display: none;"> Edge intelligence autonomous driving (EIAD) offers computing resources in autonomous vehicles for training deep neural networks. However, wireless channels between the edge server and the autonomous vehicles are time-varying due to the high-mobility of vehicles. Moreover, the required number of training samples for different data modalities, e.g., images, point-clouds, is diverse. Consequently, when collecting these datasets from vehicles to the edge server, the associated bandwidth and power allocation across all data frames is a large-scale multi-modal optimization problem. This article proposes a highly computationally efficient algorithm that directly maximizes the quality of training (QoT). The key ingredients include a data-driven model for quantifying the priority of data modality and two first-order methods termed accelerated gradient projection and dual decomposition for low-complexity resource allocation. Finally, high-fidelity simulations in Car Learning to Act (CARLA) show that the proposed algorithm reduces the perception error by $3\%$ and the computation time by $98\%$. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2210.09659v2-abstract-full').style.display = 'none'; document.getElementById('2210.09659v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 December, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 October, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Submitted to IEEE</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2207.14166">arXiv:2207.14166</a> <span> [<a href="https://arxiv.org/pdf/2207.14166">pdf</a>, <a href="https://arxiv.org/ps/2207.14166">ps</a>, <a href="https://arxiv.org/format/2207.14166">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> RHA-Net: An Encoder-Decoder Network with Residual Blocks and Hybrid Attention Mechanisms for Pavement Crack Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhu%2C+G">Guijie Zhu</a>, <a href="/search/eess?searchtype=author&query=Fan%2C+Z">Zhun Fan</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+J">Jiacheng Liu</a>, <a href="/search/eess?searchtype=author&query=Yuan%2C+D">Duan Yuan</a>, <a href="/search/eess?searchtype=author&query=Ma%2C+P">Peili Ma</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+M">Meihua Wang</a>, <a href="/search/eess?searchtype=author&query=Sheng%2C+W">Weihua Sheng</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+K+C+P">Kelvin C. P. Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2207.14166v1-abstract-short" style="display: inline;"> The acquisition and evaluation of pavement surface data play an essential role in pavement condition evaluation. In this paper, an efficient and effective end-to-end network for automatic pavement crack segmentation, called RHA-Net, is proposed to improve the pavement crack segmentation accuracy. The RHA-Net is built by integrating residual blocks (ResBlocks) and hybrid attention blocks into the e… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2207.14166v1-abstract-full').style.display = 'inline'; document.getElementById('2207.14166v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2207.14166v1-abstract-full" style="display: none;"> The acquisition and evaluation of pavement surface data play an essential role in pavement condition evaluation. In this paper, an efficient and effective end-to-end network for automatic pavement crack segmentation, called RHA-Net, is proposed to improve the pavement crack segmentation accuracy. The RHA-Net is built by integrating residual blocks (ResBlocks) and hybrid attention blocks into the encoder-decoder architecture. The ResBlocks are used to improve the ability of RHA-Net to extract high-level abstract features. The hybrid attention blocks are designed to fuse both low-level features and high-level features to help the model focus on correct channels and areas of cracks, thereby improving the feature presentation ability of RHA-Net. An image data set containing 789 pavement crack images collected by a self-designed mobile robot is constructed and used for training and evaluating the proposed model. Compared with other state-of-the-art networks, the proposed model achieves better performance and the functionalities of adding residual blocks and hybrid attention mechanisms are validated in a comprehensive ablation study. Additionally, a light-weighted version of the model generated by introducing depthwise separable convolution achieves better a performance and a much faster processing speed with 1/30 of the number of U-Net parameters. The developed system can segment pavement crack in real-time on an embedded device Jetson TX2 (25 FPS). The video taken in real-time experiments is released at https://youtu.be/3XIogk0fiG4. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2207.14166v1-abstract-full').style.display = 'none'; document.getElementById('2207.14166v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 July, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2022. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2206.05949">arXiv:2206.05949</a> <span> [<a href="https://arxiv.org/pdf/2206.05949">pdf</a>, <a href="https://arxiv.org/format/2206.05949">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/JSTSP.2022.3226836">10.1109/JSTSP.2022.3226836 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Toward Ambient Intelligence: Federated Edge Learning with Task-Oriented Sensing, Computation, and Communication Integration </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Liu%2C+P">Peixi Liu</a>, <a href="/search/eess?searchtype=author&query=Zhu%2C+G">Guangxu Zhu</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+S">Shuai Wang</a>, <a href="/search/eess?searchtype=author&query=Jiang%2C+W">Wei Jiang</a>, <a href="/search/eess?searchtype=author&query=Luo%2C+W">Wu Luo</a>, <a href="/search/eess?searchtype=author&query=Poor%2C+H+V">H. Vincent Poor</a>, <a href="/search/eess?searchtype=author&query=Cui%2C+S">Shuguang Cui</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2206.05949v1-abstract-short" style="display: inline;"> In this paper, we address the problem of joint sensing, computation, and communication (SC$^{2}$) resource allocation for federated edge learning (FEEL) via a concrete case study of human motion recognition based on wireless sensing in ambient intelligence. First, by analyzing the wireless sensing process in human motion recognition, we find that there exists a thresholding value for the sensing t… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2206.05949v1-abstract-full').style.display = 'inline'; document.getElementById('2206.05949v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2206.05949v1-abstract-full" style="display: none;"> In this paper, we address the problem of joint sensing, computation, and communication (SC$^{2}$) resource allocation for federated edge learning (FEEL) via a concrete case study of human motion recognition based on wireless sensing in ambient intelligence. First, by analyzing the wireless sensing process in human motion recognition, we find that there exists a thresholding value for the sensing transmit power, exceeding which yields sensing data samples with approximately the same satisfactory quality. Then, the joint SC$^{2}$ resource allocation problem is cast to maximize the convergence speed of FEEL, under the constraints on training time, energy supply, and sensing quality of each edge device. Solving this problem entails solving two subproblems in order: the first one reduces to determine the joint sensing and communication resource allocation that maximizes the total number of samples that can be sensed during the entire training process; the second one concerns the partition of the attained total number of sensed samples over all the communication rounds to determine the batch size at each round for convergence speed maximization. The first subproblem on joint sensing and communication resource allocation is converted to a single-variable optimization problem by exploiting the derived relation between different control variables (resources), which thus allows an efficient solution via one-dimensional grid search. For the second subproblem, it is found that the number of samples to be sensed (or batch size) at each round is a decreasing function of the loss function value attained at the round. Based on this relationship, the approximate optimal batch size at each communication round is derived in closed-form as a function of the round index. Finally, extensive simulation results are provided to validate the superiority of the proposed joint SC$^{2}$ resource allocation scheme. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2206.05949v1-abstract-full').style.display = 'none'; document.getElementById('2206.05949v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 June, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">13 pages, submitted to IEEE for possible publication</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2204.09079">arXiv:2204.09079</a> <span> [<a href="https://arxiv.org/pdf/2204.09079">pdf</a>, <a href="https://arxiv.org/format/2204.09079">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/LSP.2022.3219355">10.1109/LSP.2022.3219355 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Music Source Separation with Generative Flow </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhu%2C+G">Ge Zhu</a>, <a href="/search/eess?searchtype=author&query=Darefsky%2C+J">Jordan Darefsky</a>, <a href="/search/eess?searchtype=author&query=Jiang%2C+F">Fei Jiang</a>, <a href="/search/eess?searchtype=author&query=Selitskiy%2C+A">Anton Selitskiy</a>, <a href="/search/eess?searchtype=author&query=Duan%2C+Z">Zhiyao Duan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2204.09079v4-abstract-short" style="display: inline;"> Fully-supervised models for source separation are trained on parallel mixture-source data and are currently state-of-the-art. However, such parallel data is often difficult to obtain, and it is cumbersome to adapt trained models to mixtures with new sources. Source-only supervised models, in contrast, only require individual source data for training. In this paper, we first leverage flow-based gen… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2204.09079v4-abstract-full').style.display = 'inline'; document.getElementById('2204.09079v4-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2204.09079v4-abstract-full" style="display: none;"> Fully-supervised models for source separation are trained on parallel mixture-source data and are currently state-of-the-art. However, such parallel data is often difficult to obtain, and it is cumbersome to adapt trained models to mixtures with new sources. Source-only supervised models, in contrast, only require individual source data for training. In this paper, we first leverage flow-based generators to train individual music source priors and then use these models, along with likelihood-based objectives, to separate music mixtures. We show that in singing voice separation and music separation tasks, our proposed method is competitive with a fully-supervised approach. We also demonstrate that we can flexibly add new types of sources, whereas fully-supervised approaches would require retraining of the entire model. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2204.09079v4-abstract-full').style.display = 'none'; document.getElementById('2204.09079v4-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 October, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 19 April, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by Signal Processing Letters</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2204.08169">arXiv:2204.08169</a> <span> [<a href="https://arxiv.org/pdf/2204.08169">pdf</a>, <a href="https://arxiv.org/ps/2204.08169">ps</a>, <a href="https://arxiv.org/format/2204.08169">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Actions at the Edge: Jointly Optimizing the Resources in Multi-access Edge Computing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Deng%2C+Y">Yiqin Deng</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+X">Xianhao Chen</a>, <a href="/search/eess?searchtype=author&query=Zhu%2C+G">Guangyu Zhu</a>, <a href="/search/eess?searchtype=author&query=Fang%2C+Y">Yuguang Fang</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+Z">Zhigang Chen</a>, <a href="/search/eess?searchtype=author&query=Deng%2C+X">Xiaoheng Deng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2204.08169v1-abstract-short" style="display: inline;"> Multi-access edge computing (MEC) is an emerging paradigm that pushes resources for sensing, communications, computing, storage and intelligence (SCCSI) to the premises closer to the end users, i.e., the edge, so that they could leverage the nearby rich resources to improve their quality of experience (QoE). Due to the growing emerging applications targeting at intelligentizing life-sustaining cyb… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2204.08169v1-abstract-full').style.display = 'inline'; document.getElementById('2204.08169v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2204.08169v1-abstract-full" style="display: none;"> Multi-access edge computing (MEC) is an emerging paradigm that pushes resources for sensing, communications, computing, storage and intelligence (SCCSI) to the premises closer to the end users, i.e., the edge, so that they could leverage the nearby rich resources to improve their quality of experience (QoE). Due to the growing emerging applications targeting at intelligentizing life-sustaining cyber-physical systems, this paradigm has become a hot research topic, particularly when MEC is utilized to provide edge intelligence and real-time processing and control. This article is to elaborate the research issues along this line, including basic concepts and performance metrics, killer applications, architectural design, modeling approaches and solutions, and future research directions. It is hoped that this article provides a quick introduction to this fruitful research area particularly for beginning researchers. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2204.08169v1-abstract-full').style.display = 'none'; document.getElementById('2204.08169v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 April, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">7 pages, 2 figures, accepted by IEEE Wireless Communications</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2203.15135">arXiv:2203.15135</a> <span> [<a href="https://arxiv.org/pdf/2203.15135">pdf</a>, <a href="https://arxiv.org/format/2203.15135">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Filler Word Detection and Classification: A Dataset and Benchmark </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhu%2C+G">Ge Zhu</a>, <a href="/search/eess?searchtype=author&query=Caceres%2C+J">Juan-Pablo Caceres</a>, <a href="/search/eess?searchtype=author&query=Salamon%2C+J">Justin Salamon</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2203.15135v2-abstract-short" style="display: inline;"> Filler words such as `uh' or `um' are sounds or words people use to signal they are pausing to think. Finding and removing filler words from recordings is a common and tedious task in media editing. Automatically detecting and classifying filler words could greatly aid in this task, but few studies have been published on this problem to date. A key reason is the absence of a dataset with annotated… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2203.15135v2-abstract-full').style.display = 'inline'; document.getElementById('2203.15135v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2203.15135v2-abstract-full" style="display: none;"> Filler words such as `uh' or `um' are sounds or words people use to signal they are pausing to think. Finding and removing filler words from recordings is a common and tedious task in media editing. Automatically detecting and classifying filler words could greatly aid in this task, but few studies have been published on this problem to date. A key reason is the absence of a dataset with annotated filler words for model training and evaluation. In this work, we present a novel speech dataset, PodcastFillers, with 35K annotated filler words and 50K annotations of other sounds that commonly occur in podcasts such as breaths, laughter, and word repetitions. We propose a pipeline that leverages VAD and ASR to detect filler candidates and a classifier to distinguish between filler word types. We evaluate our proposed pipeline on PodcastFillers, compare to several baselines, and present a detailed ablation study. In particular, we evaluate the importance of using ASR and how it compares to a transcription-free approach resembling keyword spotting. We show that our pipeline obtains state-of-the-art results, and that leveraging ASR strongly outperforms a keyword spotting approach. We make PodcastFillers publicly available, in the hope that our work serves as a benchmark for future research. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2203.15135v2-abstract-full').style.display = 'none'; document.getElementById('2203.15135v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 July, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 28 March, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">To appear at Insterspeech 2022</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2202.11490">arXiv:2202.11490</a> <span> [<a href="https://arxiv.org/pdf/2202.11490">pdf</a>, <a href="https://arxiv.org/format/2202.11490">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Towards Tailored Models on Private AIoT Devices: Federated Direct Neural Architecture Search </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhang%2C+C">Chunhui Zhang</a>, <a href="/search/eess?searchtype=author&query=Yuan%2C+X">Xiaoming Yuan</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+Q">Qianyun Zhang</a>, <a href="/search/eess?searchtype=author&query=Zhu%2C+G">Guangxu Zhu</a>, <a href="/search/eess?searchtype=author&query=Cheng%2C+L">Lei Cheng</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+N">Ning Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2202.11490v1-abstract-short" style="display: inline;"> Neural networks often encounter various stringent resource constraints while deploying on edge devices. To tackle these problems with less human efforts, automated machine learning becomes popular in finding various neural architectures that fit diverse Artificial Intelligence of Things (AIoT) scenarios. Recently, to prevent the leakage of private information while enable automated machine intelli… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2202.11490v1-abstract-full').style.display = 'inline'; document.getElementById('2202.11490v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2202.11490v1-abstract-full" style="display: none;"> Neural networks often encounter various stringent resource constraints while deploying on edge devices. To tackle these problems with less human efforts, automated machine learning becomes popular in finding various neural architectures that fit diverse Artificial Intelligence of Things (AIoT) scenarios. Recently, to prevent the leakage of private information while enable automated machine intelligence, there is an emerging trend to integrate federated learning and neural architecture search (NAS). Although promising as it may seem, the coupling of difficulties from both tenets makes the algorithm development quite challenging. In particular, how to efficiently search the optimal neural architecture directly from massive non-independent and identically distributed (non-IID) data among AIoT devices in a federated manner is a hard nut to crack. In this paper, to tackle this challenge, by leveraging the advances in ProxylessNAS, we propose a Federated Direct Neural Architecture Search (FDNAS) framework that allows for hardware-friendly NAS from non- IID data across devices. To further adapt to both various data distributions and different types of devices with heterogeneous embedded hardware platforms, inspired by meta-learning, a Cluster Federated Direct Neural Architecture Search (CFDNAS) framework is proposed to achieve device-aware NAS, in the sense that each device can learn a tailored deep learning model for its particular data distribution and hardware constraint. Extensive experiments on non-IID datasets have shown the state-of-the-art accuracy-efficiency trade-offs achieved by the proposed solution in the presence of both data and device heterogeneity. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2202.11490v1-abstract-full').style.display = 'none'; document.getElementById('2202.11490v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 February, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">arXiv admin note: text overlap with arXiv:2011.03372</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2202.05253">arXiv:2202.05253</a> <span> [<a href="https://arxiv.org/pdf/2202.05253">pdf</a>, <a href="https://arxiv.org/format/2202.05253">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> A Probabilistic Fusion Framework for Spoofing Aware Speaker Verification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhang%2C+Y">You Zhang</a>, <a href="/search/eess?searchtype=author&query=Zhu%2C+G">Ge Zhu</a>, <a href="/search/eess?searchtype=author&query=Duan%2C+Z">Zhiyao Duan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2202.05253v4-abstract-short" style="display: inline;"> The performance of automatic speaker verification (ASV) systems could be degraded by voice spoofing attacks. Most existing works aimed to develop standalone spoofing countermeasure (CM) systems. Relatively little work targeted at developing an integrated spoofing aware speaker verification (SASV) system. In the recent SASV challenge, the organizers encourage the development of such integration by… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2202.05253v4-abstract-full').style.display = 'inline'; document.getElementById('2202.05253v4-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2202.05253v4-abstract-full" style="display: none;"> The performance of automatic speaker verification (ASV) systems could be degraded by voice spoofing attacks. Most existing works aimed to develop standalone spoofing countermeasure (CM) systems. Relatively little work targeted at developing an integrated spoofing aware speaker verification (SASV) system. In the recent SASV challenge, the organizers encourage the development of such integration by releasing official protocols and baselines. In this paper, we build a probabilistic framework for fusing the ASV and CM subsystem scores. We further propose fusion strategies for direct inference and fine-tuning to predict the SASV score based on the framework. Surprisingly, these strategies significantly improve the SASV equal error rate (EER) from 19.31% of the baseline to 1.53% on the official evaluation trials of the SASV challenge. We verify the effectiveness of our proposed components through ablation studies and provide insights with score distribution analysis. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2202.05253v4-abstract-full').style.display = 'none'; document.getElementById('2202.05253v4-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 April, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 10 February, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages, 5 figures, to be appear in Odyssey 2022</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2201.12581">arXiv:2201.12581</a> <span> [<a href="https://arxiv.org/pdf/2201.12581">pdf</a>, <a href="https://arxiv.org/format/2201.12581">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Integrated Sensing, Communication, and Computation Over-the-Air: MIMO Beamforming Design </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Li%2C+X">Xiaoyang Li</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+F">Fan Liu</a>, <a href="/search/eess?searchtype=author&query=Zhou%2C+Z">Ziqin Zhou</a>, <a href="/search/eess?searchtype=author&query=Zhu%2C+G">Guangxu Zhu</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+S">Shuai Wang</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+K">Kaibin Huang</a>, <a href="/search/eess?searchtype=author&query=Gong%2C+Y">Yi Gong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2201.12581v2-abstract-short" style="display: inline;"> To support the unprecedented growth of the Internet of Things (IoT) applications, tremendous data need to be collected by the IoT devices and delivered to the server for further computation. By utilizing the same signals for both radar sensing and data communication, the integrated sensing and communication (ISAC) technique has broken the barriers between data collection and delivery in the physic… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2201.12581v2-abstract-full').style.display = 'inline'; document.getElementById('2201.12581v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2201.12581v2-abstract-full" style="display: none;"> To support the unprecedented growth of the Internet of Things (IoT) applications, tremendous data need to be collected by the IoT devices and delivered to the server for further computation. By utilizing the same signals for both radar sensing and data communication, the integrated sensing and communication (ISAC) technique has broken the barriers between data collection and delivery in the physical layer. By exploiting the analog-wave addition in a multi-access channel, over-the-air computation (AirComp) enables function computation via transmissions in the physical layer. The promising performance of ISAC and AirComp motivates the current work on developing a framework called integrated sensing, communication, and computation over-the-air (ISCCO). The performance metrics of radar sensing and AirComp are evaluated by the mean squared errors of the estimated target response matrix and the received computation results, respectively. The design challenge of MIMO ISCCO lies in the joint optimization of beamformers for sensing, communication, and computation at both the IoT devices and the server, which results in a non-convex problem. To solve this problem, an algorithmic solution based on the technique of semidefinite relaxation is proposed. The use case of target location estimation based on ISCCO is demonstrated in simulation to show the performance superiority. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2201.12581v2-abstract-full').style.display = 'none'; document.getElementById('2201.12581v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 February, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 29 January, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This paper has been submitted to IEEE for possible publication</span> </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Zhu%2C+G&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Zhu%2C+G&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhu%2C+G&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> </ul> </nav> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>