<!-- stray pre-DOCTYPE text from page capture removed -->
<!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"/> <meta name="viewport" content="width=device-width, initial-scale=1"/> <!-- new favicon config and versions by realfavicongenerator.net --> <link rel="apple-touch-icon" sizes="180x180" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-16x16.png"> <link rel="manifest" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/site.webmanifest"> <link rel="mask-icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/safari-pinned-tab.svg" color="#b31b1b"> <link rel="shortcut icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon.ico"> <meta name="msapplication-TileColor" content="#b31b1b"> <meta name="msapplication-config" content="images/icons/browserconfig.xml"> <meta name="theme-color" content="#b31b1b"> <!-- end favicon config --> <title>Search | arXiv e-print repository</title> <script defer src="https://static.arxiv.org/static/base/1.0.0a5/fontawesome-free-5.11.2-web/js/all.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/base/1.0.0a5/css/arxivstyle.css" /> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ messageStyle: "none", extensions: ["tex2jax.js"], jax: ["input/TeX", "output/HTML-CSS"], tex2jax: { inlineMath: [ ['$','$'], ["\\(","\\)"] ], displayMath: [ ['$$','$$'], ["\\[","\\]"] ], processEscapes: true, ignoreClass: '.*', processClass: 'mathjax.*' }, TeX: { extensions: ["AMSmath.js", "AMSsymbols.js", "noErrors.js"], noErrors: { inlineDelimiters: ["$","$"], multiLine: false, style: { "font-size": "normal", "border": "" } } }, "HTML-CSS": { availableFonts: ["TeX"] } }); </script> <script 
src="https://static.arxiv.org/MathJax-2.7.3/MathJax.js"></script> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/notification.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/bulma-tooltip.min.css" /> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/search.css" /> <script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g=" crossorigin="anonymous"></script> <script src="https://static.arxiv.org/static/search/0.5.6/js/fieldset.js"></script> <style> input#cf-customfield_11400 { display: none; } </style> </head> <body> <header><a href="#main-container" class="is-sr-only">Skip to main content</a> <!-- contains Cornell logo and sponsor statement --> <div class="attribution level is-marginless" role="banner"> <div class="level-left"> <a class="level-item" href="https://cornell.edu/"><img src="https://static.arxiv.org/static/base/1.0.0a5/images/cornell-reduced-white-SMALL.svg" alt="Cornell University" width="200" aria-label="logo" /></a> </div> <div class="level-right is-marginless"><p class="sponsors level-item is-marginless"><span id="support-ack-url">We gratefully acknowledge support from<br /> the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors.
<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" 
role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1&ndash;50 of 144 results for author: <span class="mathjax">Kang, J</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> <div class="content"> <form method="GET" action="/search/eess" role="search"> Searching in archive <strong>eess</strong>. <a href="/search/?searchtype=author&query=Kang%2C+J">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..."
type="text" value="Kang, J"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Kang%2C+J&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option 
value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query-mirror" name="query" type="text" value="Kang, J"> <ul id="abstracts"><li><input checked id="abstracts-0-mirror" name="abstracts" type="radio" value="show"> <label for="abstracts-0-mirror">Show abstracts</label></li><li><input id="abstracts-1-mirror" name="abstracts" type="radio" value="hide"> <label for="abstracts-1-mirror">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>.
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Kang%2C+J&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Kang%2C+J&start=0" class="pagination-link is-current" aria-label="Page 1" aria-current="page">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Kang%2C+J&start=50" class="pagination-link " aria-label="Page 2">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Kang%2C+J&start=100" class="pagination-link " aria-label="Page 3">3 </a> </li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.06149">arXiv:2503.06149</a> <span> [<a href="https://arxiv.org/pdf/2503.06149">pdf</a>, <a href="https://arxiv.org/format/2503.06149">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Wireless Hallucination in
Generative AI-enabled Communications: Concepts, Issues, and Solutions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Wang%2C+X">Xudong Wang</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+J">Jiacheng Wang</a>, <a href="/search/eess?searchtype=author&query=Feng%2C+L">Lei Feng</a>, <a href="/search/eess?searchtype=author&query=Niyato%2C+D">Dusit Niyato</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+R">Ruichen Zhang</a>, <a href="/search/eess?searchtype=author&query=Kang%2C+J">Jiawen Kang</a>, <a href="/search/eess?searchtype=author&query=Xiong%2C+Z">Zehui Xiong</a>, <a href="/search/eess?searchtype=author&query=Du%2C+H">Hongyang Du</a>, <a href="/search/eess?searchtype=author&query=Mao%2C+S">Shiwen Mao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.06149v1-abstract-short" style="display: inline;"> Generative AI (GenAI) is driving the intelligence of wireless communications. Due to data limitations, random generation, and dynamic environments, GenAI may generate channel information or optimization strategies that violate physical laws or deviate from actual real-world requirements. We refer to this phenomenon as wireless hallucination, which results in invalid channel information, spectrum w… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.06149v1-abstract-full').style.display = 'inline'; document.getElementById('2503.06149v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.06149v1-abstract-full" style="display: none;"> Generative AI (GenAI) is driving the intelligence of wireless communications. 
Due to data limitations, random generation, and dynamic environments, GenAI may generate channel information or optimization strategies that violate physical laws or deviate from actual real-world requirements. We refer to this phenomenon as wireless hallucination, which results in invalid channel information, spectrum wastage, and low communication reliability but remains underexplored. To address this gap, this article provides a comprehensive concept of wireless hallucinations in GenAI-driven communications, focusing on hallucination mitigation. Specifically, we first introduce the fundamental, analyze its causes based on the GenAI workflow, and propose mitigation solutions at the data, model, and post-generation levels. Then, we systematically examines representative hallucination scenarios in GenAI-enabled communications and their corresponding solutions. Finally, we propose a novel integrated mitigation solution for GenAI-based channel estimation. At the data level, we establish a channel estimation hallucination dataset and employ generative adversarial networks (GANs)-based data augmentation. Additionally, we incorporate attention mechanisms and large language models (LLMs) to enhance both training and inference performance. Experimental results demonstrate that the proposed hybrid solutions reduce the normalized mean square error (NMSE) by 0.19, effectively reducing wireless hallucinations. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.06149v1-abstract-full').style.display = 'none'; document.getElementById('2503.06149v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">7 pages, 4 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.17475">arXiv:2502.17475</a> <span> [<a href="https://arxiv.org/pdf/2502.17475">pdf</a>, <a href="https://arxiv.org/format/2502.17475">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> ECG-Expert-QA: A Benchmark for Evaluating Medical Large Language Models in Heart Disease Diagnosis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Wang%2C+X">Xu Wang</a>, <a href="/search/eess?searchtype=author&query=Kang%2C+J">Jiaju Kang</a>, <a href="/search/eess?searchtype=author&query=Han%2C+P">Puyu Han</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.17475v2-abstract-short" style="display: inline;"> We present ECG-Expert-QA, a comprehensive multimodal dataset designed for evaluating diagnostic capabilities in ECG interpretation, integrating real clinical data with systematically generated synthetic cases. 
The dataset encompasses six fundamental diagnostic tasks, comprising 47,211 meticulously curated question-answer pairs that span a spectrum of clinical scenarios, from basic rhythm analysis… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.17475v2-abstract-full').style.display = 'inline'; document.getElementById('2502.17475v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.17475v2-abstract-full" style="display: none;"> We present ECG-Expert-QA, a comprehensive multimodal dataset designed for evaluating diagnostic capabilities in ECG interpretation, integrating real clinical data with systematically generated synthetic cases. The dataset encompasses six fundamental diagnostic tasks, comprising 47,211 meticulously curated question-answer pairs that span a spectrum of clinical scenarios, from basic rhythm analysis to complex case interpretation. By simulating challenging clinical cases through a rigorous medical knowledge-guided process, ECG-Expert-QA not only enhances the availability of annotated diagnostic data but also significantly increases the complexity and diversity of clinical presentations, including rare cardiac conditions and temporal progression patterns. This design enables comprehensive evaluation of medical language models across multiple dimensions, including diagnostic accuracy, clinical reasoning, and knowledge integration. To facilitate global research collaboration, ECG-Expert-QA is available in both Chinese and English versions, with rigorous quality control ensuring linguistic and clinical consistency. The dataset's challenging diagnostic tasks, which include interpretation of complex arrhythmias, identification of subtle ischemic changes, and integration of clinical context, establish it as an effective benchmark for advancing AI-assisted ECG interpretation and pushing the boundaries of current diagnostic models. 
Our dataset is open-source and available at https://github.com/Zaozzz/ECG-Expert-QA <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.17475v2-abstract-full').style.display = 'none'; document.getElementById('2502.17475v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 16 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.12622">arXiv:2502.12622</a> <span> [<a href="https://arxiv.org/pdf/2502.12622">pdf</a>, <a href="https://arxiv.org/format/2502.12622">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Generative AI Enabled Robust Data Augmentation for Wireless Sensing in ISAC Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Wang%2C+J">Jiacheng Wang</a>, <a href="/search/eess?searchtype=author&query=Zhao%2C+C">Changyuan Zhao</a>, <a href="/search/eess?searchtype=author&query=Du%2C+H">Hongyang Du</a>, <a href="/search/eess?searchtype=author&query=Sun%2C+G">Geng Sun</a>, <a href="/search/eess?searchtype=author&query=Kang%2C+J">Jiawen Kang</a>, <a href="/search/eess?searchtype=author&query=Mao%2C+S">Shiwen Mao</a>, <a href="/search/eess?searchtype=author&query=Niyato%2C+D">Dusit Niyato</a>, <a href="/search/eess?searchtype=author&query=Kim%2C+D+I">Dong In Kim</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span 
class="abstract-short has-text-grey-dark mathjax" id="2502.12622v1-abstract-short" style="display: inline;"> Integrated sensing and communication (ISAC) uses the same software and hardware resources to achieve both communication and sensing functionalities. Thus, it stands as one of the core technologies of 6G and has garnered significant attention in recent years. In ISAC systems, a variety of machine learning models are trained to analyze and identify signal patterns, thereby ensuring reliable sensing… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.12622v1-abstract-full').style.display = 'inline'; document.getElementById('2502.12622v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.12622v1-abstract-full" style="display: none;"> Integrated sensing and communication (ISAC) uses the same software and hardware resources to achieve both communication and sensing functionalities. Thus, it stands as one of the core technologies of 6G and has garnered significant attention in recent years. In ISAC systems, a variety of machine learning models are trained to analyze and identify signal patterns, thereby ensuring reliable sensing and communications. However, considering factors such as communication rates, costs, and privacy, collecting sufficient training data from various ISAC scenarios for these models is impractical. Hence, this paper introduces a generative AI (GenAI) enabled robust data augmentation scheme. The scheme first employs a conditioned diffusion model trained on a limited amount of collected CSI data to generate new samples, thereby expanding the sample quantity. Building on this, the scheme further utilizes another diffusion model to enhance the sample quality, thereby facilitating the data augmentation in scenarios where the original sensing data is insufficient and unevenly distributed. 
Moreover, we propose a novel algorithm to estimate the acceleration and jerk of signal propagation path length changes from CSI. We then use the proposed scheme to enhance the estimated parameters and detect the number of targets based on the enhanced data. The evaluation reveals that our scheme improves the detection performance by up to 70%, demonstrating reliability and robustness, which supports the deployment and practical use of the ISAC network. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.12622v1-abstract-full').style.display = 'none'; document.getElementById('2502.12622v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">13 pages, 10 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.04056">arXiv:2502.04056</a> <span> [<a href="https://arxiv.org/pdf/2502.04056">pdf</a>, <a href="https://arxiv.org/format/2502.04056">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> TQ-DiT: Efficient Time-Aware Quantization for Diffusion Transformers </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Hwang%2C+Y">Younghye Hwang</a>, <a href="/search/eess?searchtype=author&query=Lee%2C+H">Hyojin Lee</a>, <a 
href="/search/eess?searchtype=author&query=Kang%2C+J">Joonhyuk Kang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.04056v1-abstract-short" style="display: inline;"> Diffusion transformers (DiTs) combine transformer architectures with diffusion models. However, their computational complexity imposes significant limitations on real-time applications and sustainability of AI systems. In this study, we aim to enhance the computational efficiency through model quantization, which represents the weights and activation values with lower precision. Multi-region quant… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.04056v1-abstract-full').style.display = 'inline'; document.getElementById('2502.04056v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.04056v1-abstract-full" style="display: none;"> Diffusion transformers (DiTs) combine transformer architectures with diffusion models. However, their computational complexity imposes significant limitations on real-time applications and sustainability of AI systems. In this study, we aim to enhance the computational efficiency through model quantization, which represents the weights and activation values with lower precision. Multi-region quantization (MRQ) is introduced to address the asymmetric distribution of network values in DiT blocks by allocating two scaling parameters to sub-regions. Additionally, time-grouping quantization (TGQ) is proposed to reduce quantization error caused by temporal variation in activations. The experimental results show that the proposed algorithm achieves performance comparable to the original full-precision model with only a 0.29 increase in FID at W8A8. 
Furthermore, it outperforms other baselines at W6A6, thereby confirming its suitability for low-bit quantization. These results highlight the potential of our method to enable efficient real-time generative models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.04056v1-abstract-full').style.display = 'none'; document.getElementById('2502.04056v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.03979">arXiv:2502.03979</a> <span> [<a href="https://arxiv.org/pdf/2502.03979">pdf</a>, <a href="https://arxiv.org/format/2502.03979">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Towards Unified Music Emotion Recognition across Dimensional and Categorical Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Kang%2C+J">Jaeyong Kang</a>, <a href="/search/eess?searchtype=author&query=Herremans%2C+D">Dorien Herremans</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark 
mathjax" id="2502.03979v1-abstract-short" style="display: inline;"> One of the most significant challenges in Music Emotion Recognition (MER) comes from the fact that emotion labels can be heterogeneous across datasets with regard to the emotion representation, including categorical (e.g., happy, sad) versus dimensional labels (e.g., valence-arousal). In this paper, we present a unified multitask learning framework that combines these two types of labels and is th… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.03979v1-abstract-full').style.display = 'inline'; document.getElementById('2502.03979v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.03979v1-abstract-full" style="display: none;"> One of the most significant challenges in Music Emotion Recognition (MER) comes from the fact that emotion labels can be heterogeneous across datasets with regard to the emotion representation, including categorical (e.g., happy, sad) versus dimensional labels (e.g., valence-arousal). In this paper, we present a unified multitask learning framework that combines these two types of labels and is thus able to be trained on multiple datasets. This framework uses an effective input representation that combines musical features (i.e., key and chords) and MERT embeddings. Moreover, knowledge distillation is employed to transfer the knowledge of teacher models trained on individual datasets to a student model, enhancing its ability to generalize across multiple tasks. To validate our proposed framework, we conducted extensive experiments on a variety of datasets, including MTG-Jamendo, DEAM, PMEmo, and EmoMusic. According to our experimental results, the inclusion of musical features, multitask learning, and knowledge distillation significantly enhances performance. 
In particular, our model outperforms the state-of-the-art models, including the best-performing model from the MediaEval 2021 competition on the MTG-Jamendo dataset. Our work makes a significant contribution to MER by allowing the combination of categorical and dimensional emotion labels in one unified framework, thus enabling training across datasets. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.03979v1-abstract-full').style.display = 'none'; document.getElementById('2502.03979v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.15311">arXiv:2501.15311</a> <span> [<a href="https://arxiv.org/pdf/2501.15311">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Kalman filter/deep-learning hybrid automatic boundary tracking of optical coherence tomography data for deep anterior lamellar keratoplasty (DALK) </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Yi%2C+H">Hongrui Yi</a>, <a href="/search/eess?searchtype=author&query=Yu%2C+J">Jinglun Yu</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+Y">Yaning Wang</a>, <a href="/search/eess?searchtype=author&query=Opfermann%2C+J">Justin Opfermann</a>, <a href="/search/eess?searchtype=author&query=Gensheimer%2C+B+G">Bill G. Gensheimer</a>, <a href="/search/eess?searchtype=author&query=Kriger%2C+A">Axel Kriger</a>, <a href="/search/eess?searchtype=author&query=Kang%2C+J+U">Jin U. 
Kang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.15311v2-abstract-short" style="display: inline;"> Deep anterior lamellar keratoplasty (DALK) is a highly challenging partial thickness cornea transplant surgery that replaces the anterior cornea above Descemet's membrane (DM) with a donor cornea. In our previous work, we proposed the design of an optical coherence tomography (OCT) sensor integrated needle to acquire real-time M-mode images to provide depth feedback during OCT-guided needle insert… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.15311v2-abstract-full').style.display = 'inline'; document.getElementById('2501.15311v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.15311v2-abstract-full" style="display: none;"> Deep anterior lamellar keratoplasty (DALK) is a highly challenging partial thickness cornea transplant surgery that replaces the anterior cornea above Descemet's membrane (DM) with a donor cornea. In our previous work, we proposed the design of an optical coherence tomography (OCT) sensor integrated needle to acquire real-time M-mode images to provide depth feedback during OCT-guided needle insertion during Big Bubble DALK procedures. Machine learning and deep learning techniques were applied to M-mode images to automatically identify the DM in OCT M-scan data. However, such segmentation methods often produce inconsistent or jagged segmentation of the DM which reduces the model accuracy. Here we present a Kalman filter based OCT M-scan boundary tracking algorithm in addition to AI-based precise needle guidance to improve automatic DM segmentation for OCT-guided DALK procedures. 
By using the Kalman filter, the proposed method generates a smoother layer segmentation result from OCT M-mode images for more accurate tracking of the DM layer and epithelium. Initial ex vivo testing demonstrates that the proposed approach significantly increases the segmentation accuracy compared to conventional methods without the Kalman filter. Our proposed model can provide more consistent and precise depth sensing results, which has great potential to improve surgical safety and ultimately contributes to better patient outcomes. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.15311v2-abstract-full').style.display = 'none'; document.getElementById('2501.15311v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 25 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.14566">arXiv:2501.14566</a> <span> [<a href="https://arxiv.org/pdf/2501.14566">pdf</a>, <a href="https://arxiv.org/format/2501.14566">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Calibrating Wireless AI via Meta-Learned Context-Dependent Conformal Prediction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Yoo%2C+S">Seonghoon Yoo</a>, <a href="/search/eess?searchtype=author&query=Park%2C+S">Sangwoo Park</a>, <a href="/search/eess?searchtype=author&query=Popovski%2C+P">Petar Popovski</a>, <a href="/search/eess?searchtype=author&query=Kang%2C+J">Joonhyuk Kang</a>, <a href="/search/eess?searchtype=author&query=Simeone%2C+O">Osvaldo Simeone</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.14566v3-abstract-short" style="display: inline;"> Modern software-defined networks, such as Open Radio Access Network (O-RAN) systems, rely on artificial intelligence (AI)-powered applications running on controllers interfaced with the radio access network. To ensure that these AI applications operate reliably at runtime, they must be properly calibrated before deployment. 
A promising and theoretically grounded approach to calibration is conforma… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.14566v3-abstract-full').style.display = 'inline'; document.getElementById('2501.14566v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.14566v3-abstract-full" style="display: none;"> Modern software-defined networks, such as Open Radio Access Network (O-RAN) systems, rely on artificial intelligence (AI)-powered applications running on controllers interfaced with the radio access network. To ensure that these AI applications operate reliably at runtime, they must be properly calibrated before deployment. A promising and theoretically grounded approach to calibration is conformal prediction (CP), which enhances any AI model by transforming it into a provably reliable set predictor that provides error bars for estimates and decisions. CP requires calibration data that matches the distribution of the environment encountered during runtime. However, in practical scenarios, network controllers often have access only to data collected under different contexts -- such as varying traffic patterns and network conditions -- leading to a mismatch between the calibration and runtime distributions. This paper introduces a novel methodology to address this calibration-test distribution shift. The approach leverages meta-learning to develop a zero-shot estimator of distribution shifts, relying solely on contextual information. The proposed method, called meta-learned context-dependent weighted conformal prediction (ML-WCP), enables effective calibration of AI applications without requiring data from the current context. Additionally, it can incorporate data from multiple contexts to further enhance calibration reliability. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.14566v3-abstract-full').style.display = 'none'; document.getElementById('2501.14566v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 24 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.04735">arXiv:2501.04735</a> <span> [<a href="https://arxiv.org/pdf/2501.04735">pdf</a>, <a href="https://arxiv.org/format/2501.04735">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Topology-based deep-learning segmentation method for deep anterior lamellar keratoplasty (DALK) surgical guidance using M-mode OCT data </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Yu%2C+J">J. Yu</a>, <a href="/search/eess?searchtype=author&query=Yi%2C+H">H. Yi</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+Y">Y. Wang</a>, <a href="/search/eess?searchtype=author&query=Opfermann%2C+J+D">J. D. Opfermann</a>, <a href="/search/eess?searchtype=author&query=Gensheimer%2C+W+G">W. G. Gensheimer</a>, <a href="/search/eess?searchtype=author&query=Krieger%2C+A">A. Krieger</a>, <a href="/search/eess?searchtype=author&query=Kang%2C+J+U">J. U. 
Kang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.04735v1-abstract-short" style="display: inline;"> Deep Anterior Lamellar Keratoplasty (DALK) is a partial-thickness corneal transplant procedure used to treat corneal stromal diseases. A crucial step in this procedure is the precise separation of the deep stroma from Descemet's membrane (DM) using the Big Bubble technique. To simplify the tasks of needle insertion and pneumo-dissection in this technique, we previously developed an Optical Coheren… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.04735v1-abstract-full').style.display = 'inline'; document.getElementById('2501.04735v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.04735v1-abstract-full" style="display: none;"> Deep Anterior Lamellar Keratoplasty (DALK) is a partial-thickness corneal transplant procedure used to treat corneal stromal diseases. A crucial step in this procedure is the precise separation of the deep stroma from Descemet's membrane (DM) using the Big Bubble technique. To simplify the tasks of needle insertion and pneumo-dissection in this technique, we previously developed an Optical Coherence Tomography (OCT)-guided, eye-mountable robot that uses real-time tracking of corneal layers from M-mode OCT signals for control. However, signal noise and instability during manipulation of the OCT fiber sensor-integrated needle have hindered the performance of conventional deep-learning segmentation methods, resulting in rough and inaccurate detection of corneal layers. To address these challenges, we have developed a topology-based deep-learning segmentation method that integrates a topological loss function with a modified network architecture. 
This approach effectively reduces the effects of noise and improves segmentation speed, precision, and stability. Validation using in vivo, ex vivo, and hybrid rabbit eye datasets demonstrates that our method outperforms traditional loss-based techniques, providing fast, accurate, and robust segmentation of the epithelium and DM to guide surgery. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.04735v1-abstract-full').style.display = 'none'; document.getElementById('2501.04735v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.03727">arXiv:2501.03727</a> <span> [<a href="https://arxiv.org/pdf/2501.03727">pdf</a>, <a href="https://arxiv.org/format/2501.03727">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Detecting Neurocognitive Disorders through Analyses of Topic Evolution and Cross-modal Consistency in Visual-Stimulated Narratives </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Li%2C+J">Jinchao Li</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+Y">Yuejiao Wang</a>, <a href="/search/eess?searchtype=author&query=Li%2C+J">Junan Li</a>, <a href="/search/eess?searchtype=author&query=Kang%2C+J">Jiawen Kang</a>, <a href="/search/eess?searchtype=author&query=Zheng%2C+B">Bo Zheng</a>, <a 
href="/search/eess?searchtype=author&query=Wong%2C+S">Simon Wong</a>, <a href="/search/eess?searchtype=author&query=Mak%2C+B">Brian Mak</a>, <a href="/search/eess?searchtype=author&query=Fung%2C+H">Helene Fung</a>, <a href="/search/eess?searchtype=author&query=Woo%2C+J">Jean Woo</a>, <a href="/search/eess?searchtype=author&query=Mak%2C+M">Man-Wai Mak</a>, <a href="/search/eess?searchtype=author&query=Kwok%2C+T">Timothy Kwok</a>, <a href="/search/eess?searchtype=author&query=Mok%2C+V">Vincent Mok</a>, <a href="/search/eess?searchtype=author&query=Gong%2C+X">Xianmin Gong</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+X">Xixin Wu</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+X">Xunying Liu</a>, <a href="/search/eess?searchtype=author&query=Wong%2C+P">Patrick Wong</a>, <a href="/search/eess?searchtype=author&query=Meng%2C+H">Helen Meng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.03727v1-abstract-short" style="display: inline;"> Early detection of neurocognitive disorders (NCDs) is crucial for timely intervention and disease management. Speech analysis offers a non-intrusive and scalable screening method, particularly through narrative tasks in neuropsychological assessment tools. Traditional narrative analysis often focuses on local indicators in microstructure, such as word usage and syntax. While these features provide… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.03727v1-abstract-full').style.display = 'inline'; document.getElementById('2501.03727v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.03727v1-abstract-full" style="display: none;"> Early detection of neurocognitive disorders (NCDs) is crucial for timely intervention and disease management. 
Speech analysis offers a non-intrusive and scalable screening method, particularly through narrative tasks in neuropsychological assessment tools. Traditional narrative analysis often focuses on local indicators in microstructure, such as word usage and syntax. While these features provide insights into language production abilities, they often fail to capture global narrative patterns, or macrostructures. Macrostructures include coherence, thematic organization, and logical progressions, reflecting essential cognitive skills potentially critical for recognizing NCDs. Addressing this gap, we propose to investigate specific cognitive and linguistic challenges by analyzing topical shifts, temporal dynamics, and the coherence of narratives over time, aiming to reveal cognitive deficits by identifying narrative impairments, and exploring their impact on communication and cognition. The investigation is based on the CU-MARVEL Rabbit Story corpus, which comprises recordings of a story-telling task from 758 older adults. We developed two approaches: the Dynamic Topic Models (DTM)-based temporal analysis to examine the evolution of topics over time, and the Text-Image Temporal Alignment Network (TITAN) to evaluate the coherence between spoken narratives and visual stimuli. DTM-based approach validated the effectiveness of dynamic topic consistency as a macrostructural metric (F1=0.61, AUC=0.78). The TITAN approach achieved the highest performance (F1=0.72, AUC=0.81), surpassing established microstructural and macrostructural feature sets. Cross-comparison and regression tasks further demonstrated the effectiveness of proposed dynamic macrostructural modeling approaches for NCD detection. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.03727v1-abstract-full').style.display = 'none'; document.getElementById('2501.03727v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">12 pages, 8 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.01681">arXiv:2501.01681</a> <span> [<a href="https://arxiv.org/pdf/2501.01681">pdf</a>, <a href="https://arxiv.org/format/2501.01681">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1007/978-3-031-73001-6_19">10.1007/978-3-031-73001-6_19 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> SNeRV: Spectra-preserving Neural Representation for Video </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Kim%2C+J">Jina Kim</a>, <a href="/search/eess?searchtype=author&query=Lee%2C+J">Jihoo Lee</a>, <a href="/search/eess?searchtype=author&query=Kang%2C+J">Je-Won Kang</a> </p> <p class="abstract mathjax"> <span 
class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.01681v1-abstract-short" style="display: inline;"> Neural representation for video (NeRV), which employs a neural network to parameterize video signals, introduces a novel methodology in video representations. However, existing NeRV-based methods have difficulty in capturing fine spatial details and motion patterns due to spectral bias, in which a neural network learns high-frequency (HF) components at a slower rate than low-frequency (LF) compone… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.01681v1-abstract-full').style.display = 'inline'; document.getElementById('2501.01681v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.01681v1-abstract-full" style="display: none;"> Neural representation for video (NeRV), which employs a neural network to parameterize video signals, introduces a novel methodology in video representations. However, existing NeRV-based methods have difficulty in capturing fine spatial details and motion patterns due to spectral bias, in which a neural network learns high-frequency (HF) components at a slower rate than low-frequency (LF) components. In this paper, we propose spectra-preserving NeRV (SNeRV) as a novel approach to enhance implicit video representations by efficiently handling various frequency components. SNeRV uses 2D discrete wavelet transform (DWT) to decompose video into LF and HF features, preserving spatial structures and directly addressing the spectral bias issue. To balance the compactness, we encode only the LF components, while HF components that include fine textures are generated by a decoder. Specialized modules, including a multi-resolution fusion unit (MFU) and a high-frequency restorer (HFR), are integrated into a backbone to facilitate the representation. 
Furthermore, we extend SNeRV to effectively capture temporal correlations between adjacent video frames, by casting the extension as additional frequency decomposition to a temporal domain. This approach allows us to embed spatio-temporal LF features into the network, using temporally extended up-sampling blocks (TUBs). Experimental results demonstrate that SNeRV outperforms existing NeRV models in capturing fine details and achieves enhanced reconstruction, making it a promising approach in the field of implicit video representations. The codes are available at https://github.com/qwertja/SNeRV. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.01681v1-abstract-full').style.display = 'none'; document.getElementById('2501.01681v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ECCV 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.19111">arXiv:2412.19111</a> <span> [<a href="https://arxiv.org/pdf/2412.19111">pdf</a>, <a href="https://arxiv.org/format/2412.19111">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Spectral Enhancement and Pseudo-Anchor Guidance for Infrared-Visible Person Re-Identification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Ge%2C+Y">Yiyuan Ge</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+Z">Zhihao Chen</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+Z">Ziyang Wang</a>, <a href="/search/eess?searchtype=author&query=Kang%2C+J">Jiaju Kang</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+M">Mingya Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.19111v2-abstract-short" style="display: inline;"> The development of deep learning has facilitated the application of person re-identification (ReID) technology in intelligent security. Visible-infrared person re-identification (VI-ReID) aims to match pedestrians across infrared and visible modality images enabling 24-hour surveillance. 
Current studies relying on unsupervised modality transformations as well as inefficient embedding constraints t… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.19111v2-abstract-full').style.display = 'inline'; document.getElementById('2412.19111v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.19111v2-abstract-full" style="display: none;"> The development of deep learning has facilitated the application of person re-identification (ReID) technology in intelligent security. Visible-infrared person re-identification (VI-ReID) aims to match pedestrians across infrared and visible modality images enabling 24-hour surveillance. Current studies relying on unsupervised modality transformations as well as inefficient embedding constraints to bridge the spectral differences between infrared and visible images, however, limit their potential performance. To tackle the limitations of the above approaches, this paper introduces a simple yet effective Spectral Enhancement and Pseudo-anchor Guidance Network, named SEPG-Net. Specifically, we propose a more homogeneous spectral enhancement scheme based on frequency domain information and greyscale space, which avoids the information loss typically caused by inefficient modality transformations. Further, a Pseudo Anchor-guided Bidirectional Aggregation (PABA) loss is introduced to bridge local modality discrepancies while better preserving discriminative identity embeddings. Experimental results on two public benchmark datasets demonstrate the superior performance of SEPG-Net against other state-of-the-art methods. The code is available at https://github.com/1024AILab/ReID-SEPG. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.19111v2-abstract-full').style.display = 'none'; document.getElementById('2412.19111v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 26 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.17387">arXiv:2411.17387</a> <span> [<a href="https://arxiv.org/pdf/2411.17387">pdf</a>, <a href="https://arxiv.org/format/2411.17387">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Robust Bayesian Optimization via Localized Online Conformal Prediction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Kim%2C+D">Dongwon Kim</a>, <a href="/search/eess?searchtype=author&query=Zecchin%2C+M">Matteo Zecchin</a>, <a href="/search/eess?searchtype=author&query=Park%2C+S">Sangwoo Park</a>, <a href="/search/eess?searchtype=author&query=Kang%2C+J">Joonhyuk Kang</a>, <a href="/search/eess?searchtype=author&query=Simeone%2C+O">Osvaldo Simeone</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.17387v1-abstract-short" style="display: inline;"> Bayesian optimization (BO) is a sequential approach for optimizing black-box objective functions using 
zeroth-order noisy observations. In BO, Gaussian processes (GPs) are employed as probabilistic surrogate models to estimate the objective function based on past observations, guiding the selection of future queries to maximize utility. However, the performance of BO heavily relies on the quality… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.17387v1-abstract-full').style.display = 'inline'; document.getElementById('2411.17387v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.17387v1-abstract-full" style="display: none;"> Bayesian optimization (BO) is a sequential approach for optimizing black-box objective functions using zeroth-order noisy observations. In BO, Gaussian processes (GPs) are employed as probabilistic surrogate models to estimate the objective function based on past observations, guiding the selection of future queries to maximize utility. However, the performance of BO heavily relies on the quality of these probabilistic estimates, which can deteriorate significantly under model misspecification. To address this issue, we introduce localized online conformal prediction-based Bayesian optimization (LOCBO), a BO algorithm that calibrates the GP model through localized online conformal prediction (CP). LOCBO corrects the GP likelihood based on predictive sets produced by LOCBO, and the corrected GP likelihood is then denoised to obtain a calibrated posterior distribution on the objective function. The likelihood calibration step leverages an input-dependent calibration threshold to tailor coverage guarantees to different regions of the input space. Under minimal noise assumptions, we provide theoretical performance guarantees for LOCBO's iterates that hold for the unobserved objective function. 
These theoretical findings are validated through experiments on synthetic and real-world optimization tasks, demonstrating that LOCBO consistently outperforms state-of-the-art BO algorithms in the presence of model misspecification. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.17387v1-abstract-full').style.display = 'none'; document.getElementById('2411.17387v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.23824">arXiv:2410.23824</a> <span> [<a href="https://arxiv.org/pdf/2410.23824">pdf</a>, <a href="https://arxiv.org/format/2410.23824">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Generative AI-Powered Plugin for Robust Federated Learning in Heterogeneous IoT Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Lee%2C+Y">Youngjoon Lee</a>, <a href="/search/eess?searchtype=author&query=Gong%2C+J">Jinu Gong</a>, <a href="/search/eess?searchtype=author&query=Kang%2C+J">Joonhyuk Kang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.23824v1-abstract-short" style="display: inline;"> Federated learning enables edge devices to collaboratively train a global model while maintaining data privacy by 
keeping data localized. However, the Non-IID nature of data distribution across devices often hinders model convergence and reduces performance. In this paper, we propose a novel plugin for federated optimization techniques that approximates Non-IID data distributions to IID through ge… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23824v1-abstract-full').style.display = 'inline'; document.getElementById('2410.23824v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.23824v1-abstract-full" style="display: none;"> Federated learning enables edge devices to collaboratively train a global model while maintaining data privacy by keeping data localized. However, the Non-IID nature of data distribution across devices often hinders model convergence and reduces performance. In this paper, we propose a novel plugin for federated optimization techniques that approximates Non-IID data distributions to IID through generative AI-enhanced data augmentation and balanced sampling strategy. Key idea is to synthesize additional data for underrepresented classes on each edge device, leveraging generative AI to create a more balanced dataset across the FL network. Additionally, a balanced sampling approach at the central server selectively includes only the most IID-like devices, accelerating convergence while maximizing the global model's performance. Experimental results validate that our approach significantly improves convergence speed and robustness against data imbalance, establishing a flexible, privacy-preserving FL plugin that is applicable even in data-scarce environments. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23824v1-abstract-full').style.display = 'none'; document.getElementById('2410.23824v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.14577">arXiv:2410.14577</a> <span> [<a href="https://arxiv.org/pdf/2410.14577">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Reimagining partial thickness keratoplasty: An eye mountable robot for autonomous big bubble needle insertion </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Wang%2C+Y">Y. Wang</a>, <a href="/search/eess?searchtype=author&query=Opfermann%2C+J+D">J. D. Opfermann</a>, <a href="/search/eess?searchtype=author&query=Yu%2C+J">J. Yu</a>, <a href="/search/eess?searchtype=author&query=Yi%2C+H">H. Yi</a>, <a href="/search/eess?searchtype=author&query=Kaluna%2C+J">J. Kaluna</a>, <a href="/search/eess?searchtype=author&query=Biswas%2C+R">R. Biswas</a>, <a href="/search/eess?searchtype=author&query=Zuo%2C+R">R. Zuo</a>, <a href="/search/eess?searchtype=author&query=Gensheimer%2C+W">W. Gensheimer</a>, <a href="/search/eess?searchtype=author&query=Krieger%2C+A">A. 
Krieger</a>, <a href="/search/eess?searchtype=author&query=Kang%2C+J+U">J. U. Kang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.14577v1-abstract-short" style="display: inline;"> Autonomous surgical robots have demonstrated significant potential to standardize surgical outcomes, driving innovations that enhance safety and consistency regardless of individual surgeon experience. Deep anterior lamellar keratoplasty (DALK), a partial thickness corneal transplant surgery aimed at replacing the anterior part of cornea above Descemet membrane (DM), would greatly benefit from an… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.14577v1-abstract-full').style.display = 'inline'; document.getElementById('2410.14577v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.14577v1-abstract-full" style="display: none;"> Autonomous surgical robots have demonstrated significant potential to standardize surgical outcomes, driving innovations that enhance safety and consistency regardless of individual surgeon experience. Deep anterior lamellar keratoplasty (DALK), a partial thickness corneal transplant surgery aimed at replacing the anterior part of cornea above Descemet membrane (DM), would greatly benefit from an autonomous surgical approach as it highly relies on surgeon skill with high perforation rates. In this study, we proposed a novel autonomous surgical robotic system (AUTO-DALK) based on a customized neural network capable of precise needle control and consistent big bubble demarcation on cadaver and live rabbit models. We demonstrate the feasibility of an AI-based image-guided vertical drilling approach for big bubble generation, in contrast to the conventional horizontal needle approach. 
Our system integrates an optical coherence tomography (OCT) fiber optic distal sensor into the eye-mountable micro robotic system, which automatically segments OCT M-mode depth signals to identify corneal layers using a custom deep learning algorithm. It enables the robot to autonomously guide the needle to targeted tissue layers via a depth-controlled feedback loop. We compared autonomous needle insertion performance and resulting pneumo-dissection using AUTO-DALK against 1) freehand insertion, 2) OCT sensor guided manual insertion, and 3) teleoperated robotic insertion, reporting significant improvements in insertion depth, pneumo-dissection depth, task completion time, and big bubble formation. Ex vivo and in vivo results indicate that the AI-driven, AUTO-DALK system, is a promising solution to standardize pneumo-dissection outcomes for partial thickness keratoplasty. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.14577v1-abstract-full').style.display = 'none'; document.getElementById('2410.14577v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.07493">arXiv:2410.07493</a> <span> [<a href="https://arxiv.org/pdf/2410.07493">pdf</a>, <a href="https://arxiv.org/format/2410.07493">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Autonomous Robotic System with Optical Coherence Tomography Guidance for Vascular Anastomosis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Haworth%2C+J">Jesse Haworth</a>, <a href="/search/eess?searchtype=author&query=Biswas%2C+R">Rishi Biswas</a>, <a href="/search/eess?searchtype=author&query=Opfermann%2C+J">Justin Opfermann</a>, <a href="/search/eess?searchtype=author&query=Kam%2C+M">Michael Kam</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+Y">Yaning Wang</a>, <a href="/search/eess?searchtype=author&query=Pantalone%2C+D">Desire Pantalone</a>, <a href="/search/eess?searchtype=author&query=Creighton%2C+F+X">Francis X. Creighton</a>, <a href="/search/eess?searchtype=author&query=Yang%2C+R">Robin Yang</a>, <a href="/search/eess?searchtype=author&query=Kang%2C+J+U">Jin U. Kang</a>, <a href="/search/eess?searchtype=author&query=Krieger%2C+A">Axel Krieger</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.07493v1-abstract-short" style="display: inline;"> Vascular anastomosis, the surgical connection of blood vessels, is essential in procedures such as organ transplants and reconstructive surgeries. 
The precision required limits accessibility due to the extensive training needed, with manual suturing leading to variable outcomes and revision rates up to 7.9%. Existing robotic systems, while promising, are either fully teleoperated or lack the capab… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.07493v1-abstract-full').style.display = 'inline'; document.getElementById('2410.07493v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.07493v1-abstract-full" style="display: none;"> Vascular anastomosis, the surgical connection of blood vessels, is essential in procedures such as organ transplants and reconstructive surgeries. The precision required limits accessibility due to the extensive training needed, with manual suturing leading to variable outcomes and revision rates up to 7.9%. Existing robotic systems, while promising, are either fully teleoperated or lack the capabilities necessary for autonomous vascular anastomosis. We present the Micro Smart Tissue Autonomous Robot (micro-STAR), an autonomous robotic system designed to perform vascular anastomosis on small-diameter vessels. The micro-STAR system integrates a novel suturing tool equipped with Optical Coherence Tomography (OCT) fiber-optic sensor and a microcamera, enabling real-time tissue detection and classification. Our system autonomously places sutures and manipulates tissue with minimal human intervention. In an ex vivo study, micro-STAR achieved outcomes competitive with experienced surgeons in terms of leak pressure, lumen reduction, and suture placement variation, completing 90% of sutures without human intervention. This represents the first instance of a robotic system autonomously performing vascular anastomosis on real tissue, offering significant potential for improving surgical precision and expanding access to high-quality care. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.07493v1-abstract-full').style.display = 'none'; document.getElementById('2410.07493v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This paper was submitted to IEEE TMRB and is currently under review. There are 9 pages, 9 figures, and 2 tables</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 68T40: Robotics </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.00667">arXiv:2410.00667</a> <span> [<a href="https://arxiv.org/pdf/2410.00667">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Classical Physics">physics.class-ph</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1016/j.jenvman.2024.123321">10.1016/j.jenvman.2024.123321 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Contribution of soundscape appropriateness to soundscape quality assessment in space: a mediating variable affecting acoustic comfort </p> <p 
class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Yang%2C+X">Xinhao Yang</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+G">Guangyu Zhang</a>, <a href="/search/eess?searchtype=author&query=Lu%2C+X">Xiaodong Lu</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+Y">Yuan Zhang</a>, <a href="/search/eess?searchtype=author&query=Kang%2C+J">Jian Kang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.00667v2-abstract-short" style="display: inline;"> Soundscape appropriateness (SA) provides supplemental information on the matching degree between auditory information and the surrounding scene in soundscape perception. This indicator has been integrated into the standard ISO process for collecting soundscape data, forming a component of the sound quality assessment questionnaire. However, its role in soundscape quality assessment has not been fu… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.00667v2-abstract-full').style.display = 'inline'; document.getElementById('2410.00667v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.00667v2-abstract-full" style="display: none;"> Soundscape appropriateness (SA) provides supplemental information on the matching degree between auditory information and the surrounding scene in soundscape perception. This indicator has been integrated into the standard ISO process for collecting soundscape data, forming a component of the sound quality assessment questionnaire. However, its role in soundscape quality assessment has not been fully understood. Herein, we present the findings from soundscape data collected from Beiling Park in Shenyang, China. 
A method was developed that integrates mediation effect models with multiscale geographically weighted regression models to explore the mediating role of SA in the impact of sound source types on soundscape quality, as well as the spatial heterogeneity of this mediation effect. The results confirm that SA does mediate the influence of sound source types on acoustic comfort (AC). Specifically, natural sounds (indirect effect/total effect = .19/.19), traffic sounds (indirect effect/total effect = -.46/-.65), and commercial sounds (indirect effect/total effect = -.25/-.12) impact the perception of AC by either enhancing or reducing SA. Moreover, the relationships among variables depicted in this model demonstrate spatial heterogeneity, indicating that in urban open spaces with complex structures, local spatial models may be needed for soundscape assessment. The research reaffirms the significance of SA in urban open spaces. In terms of practical implications for urban and landscape planners, when sound sources cannot be controlled or altered, coordinating between the sound and the surrounding environment through landscape optimisation could also improve the quality of the soundscape through enhancing SA and help achieve the goal of creating healthy urban open spaces. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.00667v2-abstract-full').style.display = 'none'; document.getElementById('2410.00667v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 1 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by Journal of Environmental Management</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.00367">arXiv:2410.00367</a> <span> [<a href="https://arxiv.org/pdf/2410.00367">pdf</a>, <a href="https://arxiv.org/format/2410.00367">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> ROK Defense M&S in the Age of Hyperscale AI: Concepts, Challenges, and Future Directions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Lee%2C+Y">Youngjoon Lee</a>, <a href="/search/eess?searchtype=author&query=Park%2C+T">Taehyun Park</a>, <a href="/search/eess?searchtype=author&query=Kang%2C+Y">Yeongjoon Kang</a>, <a href="/search/eess?searchtype=author&query=Kim%2C+J">Jonghoe Kim</a>, <a href="/search/eess?searchtype=author&query=Kang%2C+J">Joonhyuk Kang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.00367v2-abstract-short" style="display: inline;"> Integrating hyperscale AI into national defense M&S(Modeling and Simulation), under the expanding IoMDT(Internet of Military Defense Things) framework, is crucial for boosting strategic and operational readiness. We examine how IoMDT-driven hyperscale AI can provide high accuracy, speed, and the ability to simulate complex, interconnected battlefield scenarios in defense M&S. 
Countries like the Un… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.00367v2-abstract-full').style.display = 'inline'; document.getElementById('2410.00367v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.00367v2-abstract-full" style="display: none;"> Integrating hyperscale AI into national defense M&S(Modeling and Simulation), under the expanding IoMDT(Internet of Military Defense Things) framework, is crucial for boosting strategic and operational readiness. We examine how IoMDT-driven hyperscale AI can provide high accuracy, speed, and the ability to simulate complex, interconnected battlefield scenarios in defense M&S. Countries like the United States and China are leading the adoption of these technologies, with varying levels of success. However, realizing the full potential of hyperscale AI requires overcoming challenges such as closed networks, sparse or long-tail data, complex decision-making processes, and a shortage of experts. Future directions highlight the need to adopt domestic foundation models, expand GPU/NPU investments, leverage large tech services, and employ open source solutions. These efforts will enhance national security, maintain a competitive edge, and spur broader technological and economic growth. With this blueprint, the Republic of Korea can strengthen its defense posture and stay ahead of emerging threats in modern warfare. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.00367v2-abstract-full').style.display = 'none'; document.getElementById('2410.00367v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 30 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to IEEE Internet of Things Magazine</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.16322">arXiv:2409.16322</a> <span> [<a href="https://arxiv.org/pdf/2409.16322">pdf</a>, <a href="https://arxiv.org/format/2409.16322">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Neurons and Cognition">q-bio.NC</span> </div> </div> <p class="title is-5 mathjax"> Towards Within-Class Variation in Alzheimer's Disease Detection from Spontaneous Speech </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Kang%2C+J">Jiawen 
Kang</a>, <a href="/search/eess?searchtype=author&query=Han%2C+D">Dongrui Han</a>, <a href="/search/eess?searchtype=author&query=Meng%2C+L">Lingwei Meng</a>, <a href="/search/eess?searchtype=author&query=Zhou%2C+J">Jingyan Zhou</a>, <a href="/search/eess?searchtype=author&query=Li%2C+J">Jinchao Li</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+X">Xixin Wu</a>, <a href="/search/eess?searchtype=author&query=Meng%2C+H">Helen Meng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.16322v1-abstract-short" style="display: inline;"> Alzheimer's Disease (AD) detection has emerged as a promising research area that employs machine learning classification models to distinguish between individuals with AD and those without. Unlike conventional classification tasks, we identify within-class variation as a critical challenge in AD detection: individuals with AD exhibit a spectrum of cognitive impairments. Given that many AD detectio… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.16322v1-abstract-full').style.display = 'inline'; document.getElementById('2409.16322v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.16322v1-abstract-full" style="display: none;"> Alzheimer's Disease (AD) detection has emerged as a promising research area that employs machine learning classification models to distinguish between individuals with AD and those without. Unlike conventional classification tasks, we identify within-class variation as a critical challenge in AD detection: individuals with AD exhibit a spectrum of cognitive impairments. Given that many AD detection tasks lack fine-grained labels, simplistic binary classification may overlook two crucial aspects: within-class differences and instance-level imbalance. 
The former compels the model to map AD samples with varying degrees of impairment to a single diagnostic label, disregarding certain changes in cognitive function. While the latter biases the model towards overrepresented severity levels. This work presents early efforts to address these challenges. We propose two novel methods: Soft Target Distillation (SoTD) and Instance-level Re-balancing (InRe), targeting two problems respectively. Experiments on the ADReSS and ADReSSo datasets demonstrate that the proposed methods significantly improve detection accuracy. Further analysis reveals that SoTD effectively harnesses the strengths of multiple component models, while InRe substantially alleviates model over-fitting. These findings provide insights for developing more robust and reliable AD detection models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.16322v1-abstract-full').style.display = 'none'; document.getElementById('2409.16322v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.12992">arXiv:2409.12992</a> <span> [<a href="https://arxiv.org/pdf/2409.12992">pdf</a>, <a href="https://arxiv.org/format/2409.12992">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> DiffEditor: Enhancing Speech Editing with Semantic Enrichment and Acoustic Consistency </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Chen%2C+Y">Yang Chen</a>, <a href="/search/eess?searchtype=author&query=Jia%2C+Y">Yuhang Jia</a>, <a href="/search/eess?searchtype=author&query=Zhao%2C+S">Shiwan Zhao</a>, <a href="/search/eess?searchtype=author&query=Jiang%2C+Z">Ziyue Jiang</a>, <a href="/search/eess?searchtype=author&query=Li%2C+H">Haoran Li</a>, <a href="/search/eess?searchtype=author&query=Kang%2C+J">Jiarong Kang</a>, <a href="/search/eess?searchtype=author&query=Qin%2C+Y">Yong Qin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.12992v1-abstract-short" style="display: inline;"> As text-based speech editing becomes increasingly prevalent, the demand for unrestricted free-text editing continues to grow. However, existing speech editing techniques encounter significant challenges, particularly in maintaining intelligibility and acoustic consistency when dealing with out-of-domain (OOD) text. 
In this paper, we introduce, DiffEditor, a novel speech editing model designed to e… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.12992v1-abstract-full').style.display = 'inline'; document.getElementById('2409.12992v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.12992v1-abstract-full" style="display: none;"> As text-based speech editing becomes increasingly prevalent, the demand for unrestricted free-text editing continues to grow. However, existing speech editing techniques encounter significant challenges, particularly in maintaining intelligibility and acoustic consistency when dealing with out-of-domain (OOD) text. In this paper, we introduce, DiffEditor, a novel speech editing model designed to enhance performance in OOD text scenarios through semantic enrichment and acoustic consistency. To improve the intelligibility of the edited speech, we enrich the semantic information of phoneme embeddings by integrating word embeddings extracted from a pretrained language model. Furthermore, we emphasize that interframe smoothing properties are critical for modeling acoustic consistency, and thus we propose a first-order loss function to promote smoother transitions at editing boundaries and enhance the overall fluency of the edited speech. Experimental results demonstrate that our model achieves state-of-the-art performance in both in-domain and OOD text scenarios. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.12992v1-abstract-full').style.display = 'none'; document.getElementById('2409.12992v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.12388">arXiv:2409.12388</a> <span> [<a href="https://arxiv.org/pdf/2409.12388">pdf</a>, <a href="https://arxiv.org/format/2409.12388">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Disentangling Speakers in Multi-Talker Speech Recognition with Speaker-Aware CTC </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Kang%2C+J">Jiawen Kang</a>, <a href="/search/eess?searchtype=author&query=Meng%2C+L">Lingwei Meng</a>, <a href="/search/eess?searchtype=author&query=Cui%2C+M">Mingyu Cui</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+Y">Yuejiao Wang</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+X">Xixin Wu</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+X">Xunying Liu</a>, <a href="/search/eess?searchtype=author&query=Meng%2C+H">Helen Meng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.12388v2-abstract-short" style="display: inline;"> Multi-talker speech recognition (MTASR) faces unique challenges in disentangling and transcribing overlapping speech. To address these challenges, this paper investigates the role of Connectionist Temporal Classification (CTC) in speaker disentanglement when incorporated with Serialized Output Training (SOT) for MTASR. 
Our visualization reveals that CTC guides the encoder to represent different sp… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.12388v2-abstract-full').style.display = 'inline'; document.getElementById('2409.12388v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.12388v2-abstract-full" style="display: none;"> Multi-talker speech recognition (MTASR) faces unique challenges in disentangling and transcribing overlapping speech. To address these challenges, this paper investigates the role of Connectionist Temporal Classification (CTC) in speaker disentanglement when incorporated with Serialized Output Training (SOT) for MTASR. Our visualization reveals that CTC guides the encoder to represent different speakers in distinct temporal regions of acoustic embeddings. Leveraging this insight, we propose a novel Speaker-Aware CTC (SACTC) training objective, based on the Bayes risk CTC framework. SACTC is a tailored CTC variant for multi-talker scenarios, it explicitly models speaker disentanglement by constraining the encoder to represent different speakers' tokens at specific time frames. When integrated with SOT, the SOT-SACTC model consistently outperforms standard SOT-CTC across various degrees of speech overlap. Specifically, we observe relative word error rate reductions of 10% overall and 15% on low-overlap speech. This work represents an initial exploration of CTC-based enhancements for MTASR tasks, offering a new perspective on speaker disentanglement in multi-talker speech recognition. The code is available at https://github.com/kjw11/Speaker-Aware-CTC. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.12388v2-abstract-full').style.display = 'none'; document.getElementById('2409.12388v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ICASSP2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.10072">arXiv:2409.10072</a> <span> [<a href="https://arxiv.org/pdf/2409.10072">pdf</a>, <a href="https://arxiv.org/format/2409.10072">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Speaker Contrastive Learning for Source Speaker Tracing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Wang%2C+Q">Qing Wang</a>, <a href="/search/eess?searchtype=author&query=Guo%2C+H">Hongmei Guo</a>, <a href="/search/eess?searchtype=author&query=Kang%2C+J">Jian Kang</a>, <a href="/search/eess?searchtype=author&query=Du%2C+M">Mengjie Du</a>, <a href="/search/eess?searchtype=author&query=Li%2C+J">Jie Li</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+X">Xiao-Lei Zhang</a>, <a href="/search/eess?searchtype=author&query=Xie%2C+L">Lei Xie</a> </p> <p class="abstract mathjax"> <span 
class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.10072v1-abstract-short" style="display: inline;"> As a form of biometric authentication technology, the security of speaker verification systems is of utmost importance. However, SV systems are inherently vulnerable to various types of attacks that can compromise their accuracy and reliability. One such attack is voice conversion, which modifies a person's speech to sound like another person by altering various vocal characteristics. This poses a… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.10072v1-abstract-full').style.display = 'inline'; document.getElementById('2409.10072v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.10072v1-abstract-full" style="display: none;"> As a form of biometric authentication technology, the security of speaker verification systems is of utmost importance. However, SV systems are inherently vulnerable to various types of attacks that can compromise their accuracy and reliability. One such attack is voice conversion, which modifies a person's speech to sound like another person by altering various vocal characteristics. This poses a significant threat to SV systems. To address this challenge, the Source Speaker Tracing Challenge in IEEE SLT2024 aims to identify the source speaker information in manipulated speech signals. Specifically, SSTC focuses on source speaker verification against voice conversion to determine whether two converted speech samples originate from the same source speaker. In this study, we propose a speaker contrastive learning-based approach for source speaker tracing to learn the latent source speaker information in converted speech. 
To learn a more source-speaker-related representation, we employ speaker contrastive loss during the training of the embedding extractor. This speaker contrastive loss helps identify the true source speaker embedding among several distractor speaker embeddings, enabling the embedding extractor to learn the source speaker information potentially present in the converted speech. Experiments demonstrate that our proposed speaker contrastive learning system achieves the lowest EER of 16.788% on the challenge test set, securing first place in the challenge. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.10072v1-abstract-full').style.display = 'none'; document.getElementById('2409.10072v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">7 pages, 2 figures, accepted by SLT</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.08797">arXiv:2409.08797</a> <span> [<a href="https://arxiv.org/pdf/2409.08797">pdf</a>, <a href="https://arxiv.org/format/2409.08797">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Exploring SSL Discrete Speech Features for Zipformer-based Contextual ASR </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Cui%2C+M">Mingyu Cui</a>, <a href="/search/eess?searchtype=author&query=Yang%2C+Y">Yifan Yang</a>, <a href="/search/eess?searchtype=author&query=Deng%2C+J">Jiajun Deng</a>, <a href="/search/eess?searchtype=author&query=Kang%2C+J">Jiawen Kang</a>, <a href="/search/eess?searchtype=author&query=Hu%2C+S">Shujie Hu</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+T">Tianzi Wang</a>, <a href="/search/eess?searchtype=author&query=Li%2C+Z">Zhaoqing Li</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+S">Shiliang Zhang</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+X">Xie Chen</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+X">Xunying Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.08797v1-abstract-short" style="display: inline;"> 
Self-supervised learning (SSL) based discrete speech representations are highly compact and domain adaptable. In this paper, SSL discrete speech features extracted from WavLM models are used as additional cross-utterance acoustic context features in Zipformer-Transducer ASR systems. The efficacy of replacing Fbank features with discrete token features for modelling either cross-utterance contexts… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.08797v1-abstract-full').style.display = 'inline'; document.getElementById('2409.08797v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.08797v1-abstract-full" style="display: none;"> Self-supervised learning (SSL) based discrete speech representations are highly compact and domain adaptable. In this paper, SSL discrete speech features extracted from WavLM models are used as additional cross-utterance acoustic context features in Zipformer-Transducer ASR systems. The efficacy of replacing Fbank features with discrete token features for modelling either cross-utterance contexts (from preceding and future segments), or current utterance's internal contexts alone, or both at the same time, are demonstrated thoroughly on the Gigaspeech 1000-hr corpus. The best Zipformer-Transducer system using discrete tokens based cross-utterance context features outperforms the baseline using utterance internal context only with statistically significant word error rate (WER) reductions of 0.32% to 0.41% absolute (2.78% to 3.54% relative) on the dev and test data. The lowest published WER of 11.15% and 11.14% were obtained on the dev and test sets. Our work is open-source and publicly available at https://github.com/open-creator/icefall/tree/master/egs/gigaspeech/Context\_ASR. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.08797v1-abstract-full').style.display = 'none'; document.getElementById('2409.08797v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Submitted to ICASSP 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.08596">arXiv:2409.08596</a> <span> [<a href="https://arxiv.org/pdf/2409.08596">pdf</a>, <a href="https://arxiv.org/format/2409.08596">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Large Language Model Can Transcribe Speech in Multi-Talker Scenarios with Versatile Instructions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Meng%2C+L">Lingwei Meng</a>, <a href="/search/eess?searchtype=author&query=Hu%2C+S">Shujie Hu</a>, <a href="/search/eess?searchtype=author&query=Kang%2C+J">Jiawen Kang</a>, <a href="/search/eess?searchtype=author&query=Li%2C+Z">Zhaoqing Li</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+Y">Yuejiao Wang</a>, <a 
href="/search/eess?searchtype=author&query=Wu%2C+W">Wenxuan Wu</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+X">Xixin Wu</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+X">Xunying Liu</a>, <a href="/search/eess?searchtype=author&query=Meng%2C+H">Helen Meng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.08596v1-abstract-short" style="display: inline;"> Recent advancements in large language models (LLMs) have revolutionized various domains, bringing significant progress and new opportunities. Despite progress in speech-related tasks, LLMs have not been sufficiently explored in multi-talker scenarios. In this work, we present a pioneering effort to investigate the capability of LLMs in transcribing speech in multi-talker environments, following ve… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.08596v1-abstract-full').style.display = 'inline'; document.getElementById('2409.08596v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.08596v1-abstract-full" style="display: none;"> Recent advancements in large language models (LLMs) have revolutionized various domains, bringing significant progress and new opportunities. Despite progress in speech-related tasks, LLMs have not been sufficiently explored in multi-talker scenarios. In this work, we present a pioneering effort to investigate the capability of LLMs in transcribing speech in multi-talker environments, following versatile instructions related to multi-talker automatic speech recognition (ASR), target talker ASR, and ASR based on specific talker attributes such as sex, occurrence order, language, and keyword spoken. 
Our approach utilizes WavLM and Whisper encoder to extract multi-faceted speech representations that are sensitive to speaker characteristics and semantic context. These representations are then fed into an LLM fine-tuned using LoRA, enabling the capabilities for speech comprehension and transcription. Comprehensive experiments reveal the promising performance of our proposed system, MT-LLM, in cocktail party scenarios, highlighting the potential of LLM to handle speech-related tasks based on user instructions in such complex settings. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.08596v1-abstract-full').style.display = 'none'; document.getElementById('2409.08596v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.08849">arXiv:2408.08849</a> <span> [<a href="https://arxiv.org/pdf/2408.08849">pdf</a>, <a href="https://arxiv.org/format/2408.08849">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> ECG-Chat: A Large ECG-Language Model for Cardiac Disease Diagnosis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhao%2C+Y">Yubao Zhao</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+T">Tian Zhang</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+X">Xu Wang</a>, <a href="/search/eess?searchtype=author&query=Han%2C+P">Puyu Han</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+T">Tong Chen</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+L">Linlin Huang</a>, <a href="/search/eess?searchtype=author&query=Jin%2C+Y">Youzhu Jin</a>, <a href="/search/eess?searchtype=author&query=Kang%2C+J">Jiaju Kang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.08849v1-abstract-short" style="display: inline;"> The success of Multimodal Large Language Models (MLLMs) in the medical auxiliary field shows great potential, allowing patients to engage in conversations using physiological signal data. 
However, general MLLMs perform poorly in cardiac disease diagnosis, particularly in the integration of ECG data analysis and long-text medical report generation, mainly due to the complexity of ECG data analysis… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.08849v1-abstract-full').style.display = 'inline'; document.getElementById('2408.08849v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.08849v1-abstract-full" style="display: none;"> The success of Multimodal Large Language Models (MLLMs) in the medical auxiliary field shows great potential, allowing patients to engage in conversations using physiological signal data. However, general MLLMs perform poorly in cardiac disease diagnosis, particularly in the integration of ECG data analysis and long-text medical report generation, mainly due to the complexity of ECG data analysis and the gap between text and ECG signal modalities. Additionally, models often exhibit severe stability deficiencies in long-text generation due to the lack of precise knowledge strongly related to user queries. To address these issues, we propose ECG-Chat, the first multitask MLLMs focused on ECG medical report generation, providing multimodal conversational capabilities based on cardiology knowledge. We propose a contrastive learning approach that integrates ECG waveform data with text reports, aligning ECG features with reports in a fine-grained manner. This method also results in an ECG encoder that excels in zero-shot report retrieval tasks. Additionally, expanding existing datasets, we constructed a 19k ECG diagnosis dataset and a 25k multi-turn dialogue dataset for training and fine-tuning ECG-Chat, which provides professional diagnostic and conversational capabilities. Furthermore, ECG-Chat can generate comprehensive ECG analysis reports through an automated LaTeX generation pipeline. 
We established a benchmark for the ECG report generation task and tested our model on multiple baselines. ECG-Chat achieved the best performance in classification, retrieval, multimodal dialogue, and medical report generation tasks. Our report template design has also been widely recognized by medical practitioners. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.08849v1-abstract-full').style.display = 'none'; document.getElementById('2408.08849v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.09817">arXiv:2407.09817</a> <span> [<a href="https://arxiv.org/pdf/2407.09817">pdf</a>, <a href="https://arxiv.org/format/2407.09817">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Empowering Whisper as a Joint Multi-Talker and Target-Talker Speech Recognition System </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Meng%2C+L">Lingwei Meng</a>, <a href="/search/eess?searchtype=author&query=Kang%2C+J">Jiawen Kang</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+Y">Yuejiao Wang</a>, <a href="/search/eess?searchtype=author&query=Jin%2C+Z">Zengrui Jin</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+X">Xixin Wu</a>, <a 
href="/search/eess?searchtype=author&query=Liu%2C+X">Xunying Liu</a>, <a href="/search/eess?searchtype=author&query=Meng%2C+H">Helen Meng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.09817v2-abstract-short" style="display: inline;"> Multi-talker speech recognition and target-talker speech recognition, both involve transcription in multi-talker contexts, remain significant challenges. However, existing methods rarely attempt to simultaneously address both tasks. In this study, we propose a pioneering approach to empower Whisper, which is a speech foundation model, to tackle joint multi-talker and target-talker speech recogniti… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.09817v2-abstract-full').style.display = 'inline'; document.getElementById('2407.09817v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.09817v2-abstract-full" style="display: none;"> Multi-talker speech recognition and target-talker speech recognition, both involve transcription in multi-talker contexts, remain significant challenges. However, existing methods rarely attempt to simultaneously address both tasks. In this study, we propose a pioneering approach to empower Whisper, which is a speech foundation model, to tackle joint multi-talker and target-talker speech recognition tasks. Specifically, (i) we freeze Whisper and plug a Sidecar separator into its encoder to separate mixed embedding for multiple talkers; (ii) a Target Talker Identifier is introduced to identify the embedding flow of the target talker on the fly, requiring only three-second enrollment speech as a cue; (iii) soft prompt tuning for decoder is explored for better task adaptation. 
Our method outperforms previous methods on two- and three-talker LibriMix and LibriSpeechMix datasets for both tasks, and delivers acceptable zero-shot performance on multi-talker ASR on AishellMix Mandarin dataset. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.09817v2-abstract-full').style.display = 'none'; document.getElementById('2407.09817v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 13 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to INTERSPEECH 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.05744">arXiv:2407.05744</a> <span> [<a href="https://arxiv.org/pdf/2407.05744">pdf</a>, <a href="https://arxiv.org/format/2407.05744">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1016/j.buildenv.2024.112106">10.1016/j.buildenv.2024.112106 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Automating Urban Soundscape Enhancements with AI: In-situ Assessment of Quality and Restorativeness in 
Traffic-Exposed Residential Areas </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Lam%2C+B">Bhan Lam</a>, <a href="/search/eess?searchtype=author&query=Ong%2C+Z">Zhen-Ting Ong</a>, <a href="/search/eess?searchtype=author&query=Ooi%2C+K">Kenneth Ooi</a>, <a href="/search/eess?searchtype=author&query=Ong%2C+W">Wen-Hui Ong</a>, <a href="/search/eess?searchtype=author&query=Wong%2C+T">Trevor Wong</a>, <a href="/search/eess?searchtype=author&query=Watcharasupat%2C+K+N">Karn N. Watcharasupat</a>, <a href="/search/eess?searchtype=author&query=Boey%2C+V">Vanessa Boey</a>, <a href="/search/eess?searchtype=author&query=Lee%2C+I">Irene Lee</a>, <a href="/search/eess?searchtype=author&query=Hong%2C+J+Y">Joo Young Hong</a>, <a href="/search/eess?searchtype=author&query=Kang%2C+J">Jian Kang</a>, <a href="/search/eess?searchtype=author&query=Lee%2C+K+F+A">Kar Fye Alvin Lee</a>, <a href="/search/eess?searchtype=author&query=Christopoulos%2C+G">Georgios Christopoulos</a>, <a href="/search/eess?searchtype=author&query=Gan%2C+W">Woon-Seng Gan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.05744v2-abstract-short" style="display: inline;"> Formalized in ISO 12913, the "soundscape" approach is a paradigmatic shift towards perception-based urban sound management, aiming to alleviate the substantial socioeconomic costs of noise pollution to advance the United Nations Sustainable Development Goals. 
Focusing on traffic-exposed outdoor residential sites, we implemented an automatic masker selection system (AMSS) utilizing natural sounds t… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.05744v2-abstract-full').style.display = 'inline'; document.getElementById('2407.05744v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.05744v2-abstract-full" style="display: none;"> Formalized in ISO 12913, the "soundscape" approach is a paradigmatic shift towards perception-based urban sound management, aiming to alleviate the substantial socioeconomic costs of noise pollution to advance the United Nations Sustainable Development Goals. Focusing on traffic-exposed outdoor residential sites, we implemented an automatic masker selection system (AMSS) utilizing natural sounds to mask (or augment) traffic soundscapes. We employed a pre-trained AI model to automatically select the optimal masker and adjust its playback level, adapting to changes over time in the ambient environment to maximize "Pleasantness", a perceptual dimension of soundscape quality in ISO 12913. Our validation study involving ($N=68$) residents revealed a significant 14.6 % enhancement in "Pleasantness" after intervention, correlating with increased restorativeness and positive affect. Perceptual enhancements at the traffic-exposed site matched those at a quieter control site with 6 dB(A) lower $L_\text{A,eq}$ and road traffic noise dominance, affirming the efficacy of AMSS as a soundscape intervention, while streamlining the labour-intensive assessment of "Pleasantness" with probabilistic AI prediction. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.05744v2-abstract-full').style.display = 'none'; document.getElementById('2407.05744v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 8 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">41 pages, 4 figures. Preprint submitted to Building and Environment</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Building and Environment, vol. 266, p. 112106, Dec. 2024 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.14333">arXiv:2406.14333</a> <span> [<a href="https://arxiv.org/pdf/2406.14333">pdf</a>, <a href="https://arxiv.org/format/2406.14333">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> LARP: Language Audio Relational Pre-training for Cold-Start Playlist Continuation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Salganik%2C+R">Rebecca Salganik</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+X">Xiaohao Liu</a>, <a 
href="/search/eess?searchtype=author&query=Ma%2C+Y">Yunshan Ma</a>, <a href="/search/eess?searchtype=author&query=Kang%2C+J">Jian Kang</a>, <a href="/search/eess?searchtype=author&query=Chua%2C+T">Tat-Seng Chua</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.14333v1-abstract-short" style="display: inline;"> As online music consumption increasingly shifts towards playlist-based listening, the task of playlist continuation, in which an algorithm suggests songs to extend a playlist in a personalized and musically cohesive manner, has become vital to the success of music streaming. Currently, many existing playlist continuation approaches rely on collaborative filtering methods to perform recommendation.… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.14333v1-abstract-full').style.display = 'inline'; document.getElementById('2406.14333v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.14333v1-abstract-full" style="display: none;"> As online music consumption increasingly shifts towards playlist-based listening, the task of playlist continuation, in which an algorithm suggests songs to extend a playlist in a personalized and musically cohesive manner, has become vital to the success of music streaming. Currently, many existing playlist continuation approaches rely on collaborative filtering methods to perform recommendation. However, such methods will struggle to recommend songs that lack interaction data, an issue known as the cold-start problem. Current approaches to this challenge design complex mechanisms for extracting relational signals from sparse collaborative data and integrating them into content representations. 
However, these approaches leave content representation learning out of scope and utilize frozen, pre-trained content models that may not be aligned with the distribution or format of a specific musical setting. Furthermore, even the musical state-of-the-art content modules are either (1) incompatible with the cold-start setting or (2) unable to effectively integrate cross-modal and relational signals. In this paper, we introduce LARP, a multi-modal cold-start playlist continuation model, to effectively overcome these limitations. LARP is a three-stage contrastive learning framework that integrates both multi-modal and relational signals into its learned representations. Our framework uses increasing stages of task-specific abstraction: within-track (language-audio) contrastive loss, track-track contrastive loss, and track-playlist contrastive loss. Experimental results on two publicly available datasets demonstrate the efficacy of LARP over uni-modal and multi-modal models for playlist continuation in a cold-start setting. Code and dataset are released at: https://github.com/Rsalganik1123/LARP. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.14333v1-abstract-full').style.display = 'none'; document.getElementById('2406.14333v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.11918">arXiv:2406.11918</a> <span> [<a href="https://arxiv.org/pdf/2406.11918">pdf</a>, <a href="https://arxiv.org/format/2406.11918">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> QoE Maximization for Multiple-UAV-Assisted Multi-Access Edge Computing: An Online Joint Optimization Approach </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=He%2C+L">Long He</a>, <a href="/search/eess?searchtype=author&query=Sun%2C+G">Geng Sun</a>, <a href="/search/eess?searchtype=author&query=Sun%2C+Z">Zemin Sun</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+Q">Qingqing Wu</a>, <a href="/search/eess?searchtype=author&query=Kang%2C+J">Jiawen Kang</a>, <a href="/search/eess?searchtype=author&query=Niyato%2C+D">Dusit Niyato</a>, <a href="/search/eess?searchtype=author&query=Han%2C+Z">Zhu Han</a>, <a href="/search/eess?searchtype=author&query=Leung%2C+V+C+M">Victor C. M. Leung</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.11918v1-abstract-short" style="display: inline;"> In disaster scenarios, conventional terrestrial multi-access edge computing (MEC) paradigms, which rely on fixed infrastructure, may become unavailable due to infrastructure damage. 
With high-probability line-of-sight (LoS) communication, flexible mobility, and low cost, unmanned aerial vehicle (UAV)-assisted MEC is emerging as a new promising paradigm to provide edge computing services for ground… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.11918v1-abstract-full').style.display = 'inline'; document.getElementById('2406.11918v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.11918v1-abstract-full" style="display: none;"> In disaster scenarios, conventional terrestrial multi-access edge computing (MEC) paradigms, which rely on fixed infrastructure, may become unavailable due to infrastructure damage. With high-probability line-of-sight (LoS) communication, flexible mobility, and low cost, unmanned aerial vehicle (UAV)-assisted MEC is emerging as a new promising paradigm to provide edge computing services for ground user devices (UDs) in disaster-stricken areas. However, the limited battery capacity, computing resources, and spectrum resources also pose serious challenges for UAV-assisted MEC, which can potentially shorten the service time of UAVs and degrade the quality of experience (QoE) of UDs without an effective control approach. To this end, in this work, we first present a hierarchical architecture of multiple-UAV-assisted MEC networks that enables the coordinated provision of edge computing services by multiple UAVs. Then, we formulate a joint task offloading, resource allocation, and UAV trajectory planning optimization problem (JTRTOP) to maximize the QoE of UDs while considering the energy consumption constraints of UAVs. Since the problem is proven to be a future-dependent and NP-hard problem, we propose a novel online joint task offloading, resource allocation, and UAV trajectory planning approach (OJTRTA) to solve the problem. 
Specifically, the JTRTOP is first transformed into a per-slot real-time optimization problem (PROP) using the Lyapunov optimization framework. Then, a two-stage optimization method based on game theory and convex optimization is proposed to solve the PROP. Simulation results provide empirical evidence supporting the superior system performance of the proposed OJTRTA in comparison to alternative approaches. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.11918v1-abstract-full').style.display = 'none'; document.getElementById('2406.11918v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.08809">arXiv:2406.08809</a> <span> [<a href="https://arxiv.org/pdf/2406.08809">pdf</a>, <a href="https://arxiv.org/format/2406.08809">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Are We There Yet? 
A Brief Survey of Music Emotion Prediction Datasets, Models and Outstanding Challenges </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Kang%2C+J">Jaeyong Kang</a>, <a href="/search/eess?searchtype=author&query=Herremans%2C+D">Dorien Herremans</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.08809v2-abstract-short" style="display: inline;"> Deep learning models for music have advanced drastically in recent years, but how good are machine learning models at capturing emotion, and what challenges are researchers facing? In this paper, we provide a comprehensive overview of the available music-emotion datasets and discuss evaluation standards as well as competitions in the field. We also offer a brief overview of various types of music… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.08809v2-abstract-full').style.display = 'inline'; document.getElementById('2406.08809v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.08809v2-abstract-full" style="display: none;"> Deep learning models for music have advanced drastically in recent years, but how good are machine learning models at capturing emotion, and what challenges are researchers facing? In this paper, we provide a comprehensive overview of the available music-emotion datasets and discuss evaluation standards as well as competitions in the field. We also offer a brief overview of various types of music emotion prediction models that have been built over the years, providing insights into the diverse approaches within the field. 
Through this examination, we highlight the challenges that persist in accurately capturing emotion in music, including issues related to dataset quality, annotation consistency, and model generalization. Additionally, we explore the impact of different modalities, such as audio, MIDI, and physiological signals, on the effectiveness of emotion prediction models. Recognizing the dynamic nature of this field, we have complemented our findings with an accompanying GitHub repository. This repository contains a comprehensive list of music emotion datasets and recent predictive models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.08809v2-abstract-full').style.display = 'none'; document.getElementById('2406.08809v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 13 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.05914">arXiv:2406.05914</a> <span> [<a href="https://arxiv.org/pdf/2406.05914">pdf</a>, <a href="https://arxiv.org/format/2406.05914">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Soundscape Captioning using Sound Affective Quality Network and Large Language Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Hou%2C+Y">Yuanbo Hou</a>, <a href="/search/eess?searchtype=author&query=Ren%2C+Q">Qiaoqiao Ren</a>, <a href="/search/eess?searchtype=author&query=Mitchell%2C+A">Andrew Mitchell</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+W">Wenwu Wang</a>, <a href="/search/eess?searchtype=author&query=Kang%2C+J">Jian Kang</a>, <a href="/search/eess?searchtype=author&query=Belpaeme%2C+T">Tony Belpaeme</a>, <a href="/search/eess?searchtype=author&query=Botteldooren%2C+D">Dick Botteldooren</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.05914v2-abstract-short" style="display: inline;"> We live in a rich and varied acoustic world, which is experienced by individuals or communities as a soundscape. 
Computational auditory scene analysis, disentangling acoustic scenes by detecting and classifying events, focuses on objective attributes of sounds, such as their category and temporal characteristics, ignoring their effects on people, such as the emotions they evoke within a context. T… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.05914v2-abstract-full').style.display = 'inline'; document.getElementById('2406.05914v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.05914v2-abstract-full" style="display: none;"> We live in a rich and varied acoustic world, which is experienced by individuals or communities as a soundscape. Computational auditory scene analysis, disentangling acoustic scenes by detecting and classifying events, focuses on objective attributes of sounds, such as their category and temporal characteristics, ignoring their effects on people, such as the emotions they evoke within a context. To fill this gap, we propose the soundscape captioning task, which enables automated soundscape analysis, thus avoiding labour-intensive subjective ratings and surveys in conventional methods. With soundscape captioning, context-aware descriptions are generated for soundscape by capturing the acoustic scene, event information, and the corresponding human affective qualities (AQs). To this end, we propose an automatic soundscape captioner (SoundSCaper) system composed of an acoustic model, i.e. SoundAQnet, and a large language model (LLM). SoundAQnet simultaneously models multi-scale information about acoustic scenes, events, and perceived AQs, while the LLM describes the soundscape with captions by parsing the information captured with SoundAQnet. The soundscape caption's quality is assessed by a jury of 16 audio/soundscape experts. 
The average score (out of 5) of SoundSCaper-generated captions is lower than the score of captions generated by two soundscape experts by 0.21 and 0.25, respectively, on the evaluation set and the model-unknown mixed external dataset with varying lengths and acoustic properties, but the differences are not statistically significant. Overall, the proposed SoundSCaper shows promising performance, with captions generated being comparable to those annotated by soundscape experts. The code of models, LLM scripts, human assessment data and instructions, and expert evaluation statistics are all publicly available. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.05914v2-abstract-full').style.display = 'none'; document.getElementById('2406.05914v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 9 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Code: https://github.com/Yuanbo2020/SoundSCaper</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.00408">arXiv:2406.00408</a> <span> [<a href="https://arxiv.org/pdf/2406.00408">pdf</a>, <a href="https://arxiv.org/format/2406.00408">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Optimizing 6G Integrated Sensing and Communications (ISAC) via Expert Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Wang%2C+J">Jiacheng Wang</a>, <a href="/search/eess?searchtype=author&query=Du%2C+H">Hongyang Du</a>, <a href="/search/eess?searchtype=author&query=Sun%2C+G">Geng Sun</a>, <a href="/search/eess?searchtype=author&query=Kang%2C+J">Jiawen Kang</a>, <a href="/search/eess?searchtype=author&query=Zhou%2C+H">Haibo Zhou</a>, <a href="/search/eess?searchtype=author&query=Niyato%2C+D">Dusit Niyato</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+J">Jiming Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.00408v1-abstract-short" style="display: inline;"> Integrated Sensing and Communications (ISAC) is one of the core technologies of 6G, which combines sensing and communication functions into a single system. However, limited computing and storage resources make it impractical to combine multiple sensing models into a single device, constraining the system's function and performance. 
Therefore, this article proposes enhancing ISAC with the mixture… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.00408v1-abstract-full').style.display = 'inline'; document.getElementById('2406.00408v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.00408v1-abstract-full" style="display: none;"> Integrated Sensing and Communications (ISAC) is one of the core technologies of 6G, which combines sensing and communication functions into a single system. However, limited computing and storage resources make it impractical to combine multiple sensing models into a single device, constraining the system's function and performance. Therefore, this article proposes enhancing ISAC with the mixture of experts (MoE) architecture. Rigorously, we first investigate ISAC and MoE, including their concepts, advantages, and applications. Then, we explore how MoE can enhance ISAC from the perspectives of signal processing and network optimization. Building on this, we propose an MoE based ISAC framework, which uses a gating network to selectively activate multiple experts in handling sensing tasks under given communication conditions, thereby improving the overall performance. The case study demonstrates that the proposed framework can effectively increase the accuracy and robustness in detecting targets by using wireless communication signal, providing strong support for the practical deployment and applications of the ISAC system. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.00408v1-abstract-full').style.display = 'none'; document.getElementById('2406.00408v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.20168">arXiv:2405.20168</a> <span> [<a href="https://arxiv.org/pdf/2405.20168">pdf</a>, <a href="https://arxiv.org/format/2405.20168">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/MILCOM61039.2024.10774028">10.1109/MILCOM61039.2024.10774028 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Enhancing Battlefield Awareness: An Aerial RIS-assisted ISAC System with Deep Reinforcement Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Cho%2C+H">Hyunsang Cho</a>, <a href="/search/eess?searchtype=author&query=Yoo%2C+S">Seonghoon Yoo</a>, <a href="/search/eess?searchtype=author&query=Jung%2C+B+C">Bang Chul Jung</a>, <a href="/search/eess?searchtype=author&query=Kang%2C+J">Joonhyuk Kang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.20168v1-abstract-short" style="display: inline;"> This paper 
considers a joint communication and sensing technique for enhancing situational awareness in practical battlefield scenarios. In particular, we propose an aerial reconfigurable intelligent surface (ARIS)-assisted integrated sensing and communication (ISAC) system consisting of a single access point (AP), an ARIS, multiple users, and a sensing target. With deep reinforcement learning (DR… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.20168v1-abstract-full').style.display = 'inline'; document.getElementById('2405.20168v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.20168v1-abstract-full" style="display: none;"> This paper considers a joint communication and sensing technique for enhancing situational awareness in practical battlefield scenarios. In particular, we propose an aerial reconfigurable intelligent surface (ARIS)-assisted integrated sensing and communication (ISAC) system consisting of a single access point (AP), an ARIS, multiple users, and a sensing target. With deep reinforcement learning (DRL), we jointly optimize the transmit beamforming of the AP, the RIS phase shifts, and the trajectory of the ARIS under signal-to-interference-noise ratio (SINR) constraints. Numerical results demonstrate that the proposed technique outperforms the conventional benchmark schemes by suppressing the self-interference and clutter echo signals or optimizing the RIS phase shifts. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.20168v1-abstract-full').style.display = 'none'; document.getElementById('2405.20168v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.18701">arXiv:2405.18701</a> <span> [<a href="https://arxiv.org/pdf/2405.18701">pdf</a>, <a href="https://arxiv.org/format/2405.18701">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Near-Field Localization with RIS via Two-Dimensional Signal Path Classification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Kang%2C+J">Jeongwan Kang</a>, <a href="/search/eess?searchtype=author&query=Ko%2C+S">Seung-Woo Ko</a>, <a href="/search/eess?searchtype=author&query=Kim%2C+S">Sunwoo Kim</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.18701v1-abstract-short" style="display: inline;"> In this paper, we propose two-dimensional signal path classification (2D-SPC) for reconfigurable intelligent surface (RIS)-assisted near-field (NF) localization. In the NF regime, multiple RIS-driven signal paths (SPs) can contribute to precise localization if these are decomposable and the reflected locations on the RIS are known, referred to as SP decomposition (SPD) and SP labeling (SPL), respe… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.18701v1-abstract-full').style.display = 'inline'; document.getElementById('2405.18701v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.18701v1-abstract-full" style="display: none;"> In this paper, we propose two-dimensional signal path classification (2D-SPC) for reconfigurable intelligent surface (RIS)-assisted near-field (NF) localization. 
In the NF regime, multiple RIS-driven signal paths (SPs) can contribute to precise localization if these are decomposable and the reflected locations on the RIS are known, referred to as SP decomposition (SPD) and SP labeling (SPL), respectively. To this end, each RIS element modulates the incoming SP's phase by shifting it by one of the values in the phase shift profile (PSP) lists satisfying resolution requirements. By interworking with a conventional orthogonal frequency division multiplexing (OFDM) waveform, the user equipment can construct a 2D spectrum map that couples each SP's time of arrival (ToA) and PSP. Then, we design SPL by mapping SPs with the corresponding reflected RIS elements when they share the same PSP. Given two unlabeled SPs, we derive a geometric discriminant from checking whether the current label is correct. It can be extended to more than three SPs by sorting them using pairwise geometric discriminants between adjacent ones. From simulation results, it has been demonstrated that the proposed 2D SPC achieves consistent localization accuracy even if insufficient PSPs are given. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.18701v1-abstract-full').style.display = 'none'; document.getElementById('2405.18701v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">15 pages, 12 figures, Submitted to IEEE Transactions on Wireless Communications</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.17071">arXiv:2405.17071</a> <span> [<a href="https://arxiv.org/pdf/2405.17071">pdf</a>, <a href="https://arxiv.org/format/2405.17071">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Reliable Sub-Nyquist Spectrum Sensing via Conformal Risk Control </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Lee%2C+H">Hyojin Lee</a>, <a href="/search/eess?searchtype=author&query=Park%2C+S">Sangwoo Park</a>, <a href="/search/eess?searchtype=author&query=Simeone%2C+O">Osvaldo Simeone</a>, <a href="/search/eess?searchtype=author&query=Eldar%2C+Y+C">Yonina C. Eldar</a>, <a href="/search/eess?searchtype=author&query=Kang%2C+J">Joonhyuk Kang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.17071v1-abstract-short" style="display: inline;"> Detecting occupied subbands is a key task for wireless applications such as unlicensed spectrum access. Recently, detection methods were proposed that extract per-subband features from sub-Nyquist baseband samples and then apply thresholding mechanisms based on held-out data. 
Such existing solutions can only provide guarantees in terms of false negative rate (FNR) in the asymptotic regime of large… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.17071v1-abstract-full').style.display = 'inline'; document.getElementById('2405.17071v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.17071v1-abstract-full" style="display: none;"> Detecting occupied subbands is a key task for wireless applications such as unlicensed spectrum access. Recently, detection methods were proposed that extract per-subband features from sub-Nyquist baseband samples and then apply thresholding mechanisms based on held-out data. Such existing solutions can only provide guarantees in terms of false negative rate (FNR) in the asymptotic regime of large held-out data sets. In contrast, this work proposes a threshold mechanism-based conformal risk control (CRC), a method recently introduced in statistics. The proposed CRC-based thresholding technique formally meets user-specified FNR constraints, irrespective of the size of the held-out data set. By applying the proposed CRC-based framework to both reconstruction-based and classification-based sub-Nyquist spectrum sensing techniques, it is verified via experimental results that CRC not only provides theoretical guarantees on the FNR but also offers competitive true negative rate (TNR) performance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.17071v1-abstract-full').style.display = 'none'; document.getElementById('2405.17071v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">submitted for a journal publication</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.14222">arXiv:2405.14222</a> <span> [<a href="https://arxiv.org/pdf/2405.14222">pdf</a>, <a href="https://arxiv.org/format/2405.14222">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Rate-Adaptive Quantization: A Multi-Rate Codebook Adaptation for Vector Quantization-based Generative Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Seo%2C+J">Jiwan Seo</a>, <a href="/search/eess?searchtype=author&query=Kang%2C+J">Joonhyuk Kang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.14222v2-abstract-short" style="display: inline;"> Learning discrete representations with vector quantization (VQ) has emerged as a powerful approach in various generative models. However, most VQ-based models rely on a single, fixed-rate codebook, requiring extensive retraining for new bitrates or efficiency requirements. We introduce Rate-Adaptive Quantization (RAQ), a multi-rate codebook adaptation framework for VQ-based generative models. 
RAQ… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.14222v2-abstract-full').style.display = 'inline'; document.getElementById('2405.14222v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.14222v2-abstract-full" style="display: none;"> Learning discrete representations with vector quantization (VQ) has emerged as a powerful approach in various generative models. However, most VQ-based models rely on a single, fixed-rate codebook, requiring extensive retraining for new bitrates or efficiency requirements. We introduce Rate-Adaptive Quantization (RAQ), a multi-rate codebook adaptation framework for VQ-based generative models. RAQ applies a data-driven approach to generate variable-rate codebooks from a single baseline VQ model, enabling flexible tradeoffs between compression and reconstruction fidelity. Additionally, we provide a simple clustering-based procedure for pre-trained VQ models, offering an alternative when retraining is infeasible. Our experiments show that RAQ performs effectively across multiple rates, often outperforming conventional fixed-rate VQ baselines. By enabling a single system to seamlessly handle diverse bitrate requirements, RAQ extends the adaptability of VQ-based generative models and broadens their applicability to data compression, reconstruction, and generation tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.14222v2-abstract-full').style.display = 'none'; document.getElementById('2405.14222v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 23 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Under review</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.15292">arXiv:2404.15292</a> <span> [<a href="https://arxiv.org/pdf/2404.15292">pdf</a>, <a href="https://arxiv.org/format/2404.15292">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> </div> </div> <p class="title is-5 mathjax"> Multi-objective Optimization for Multi-UAV-assisted Mobile Edge Computing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Sun%2C+G">Geng Sun</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+Y">Yixian Wang</a>, <a href="/search/eess?searchtype=author&query=Sun%2C+Z">Zemin Sun</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+Q">Qingqing Wu</a>, <a href="/search/eess?searchtype=author&query=Kang%2C+J">Jiawen Kang</a>, <a href="/search/eess?searchtype=author&query=Niyato%2C+D">Dusit Niyato</a>, <a href="/search/eess?searchtype=author&query=Leung%2C+V+C+M">Victor C. M. Leung</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.15292v1-abstract-short" style="display: inline;"> Recent developments in unmanned aerial vehicles (UAVs) and mobile edge computing (MEC) have provided users with flexible and resilient computing services. However, meeting the computing-intensive and latency-sensitive demands of users poses a significant challenge due to the limited resources of UAVs. 
To address this challenge, we present a multi-objective optimization approach for multi-UAV-assis… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.15292v1-abstract-full').style.display = 'inline'; document.getElementById('2404.15292v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.15292v1-abstract-full" style="display: none;"> Recent developments in unmanned aerial vehicles (UAVs) and mobile edge computing (MEC) have provided users with flexible and resilient computing services. However, meeting the computing-intensive and latency-sensitive demands of users poses a significant challenge due to the limited resources of UAVs. To address this challenge, we present a multi-objective optimization approach for multi-UAV-assisted MEC systems. First, we formulate a multi-objective optimization problem aiming at minimizing the total task completion delay, reducing the total UAV energy consumption, and maximizing the total amount of offloaded tasks by jointly optimizing task offloading, computation resource allocation, and UAV trajectory control. Since the problem is a mixed-integer non-linear programming (MINLP) and NP-hard problem which is challenging, we propose a joint task offloading, computation resource allocation, and UAV trajectory control (JTORATC) approach to solve the problem. 
However, since the decision variables of task offloading, computation resource allocation, and UAV trajectory control are coupled with each other, the original problem is split into three sub-problems, i.e., task offloading, computation resource allocation, and UAV trajectory control, which are solved individually to obtain the corresponding decisions. Moreover, the sub-problem of task offloading is solved by using distributed splitting and threshold rounding methods, the sub-problem of computation resource allocation is solved by adopting the Karush-Kuhn-Tucker (KKT) method, and the sub-problem of UAV trajectory control is solved by employing the successive convex approximation (SCA) method. Simulation results show that the proposed JTORATC has superior performance compared to the other benchmark methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.15292v1-abstract-full').style.display = 'none'; document.getElementById('2404.15292v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.14140">arXiv:2404.14140</a> <span> [<a href="https://arxiv.org/pdf/2404.14140">pdf</a>, <a href="https://arxiv.org/format/2404.14140">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Generative Artificial Intelligence Assisted Wireless Sensing: Human Flow Detection in Practical Communication Environments </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Wang%2C+J">Jiacheng Wang</a>, <a href="/search/eess?searchtype=author&query=Du%2C+H">Hongyang Du</a>, <a href="/search/eess?searchtype=author&query=Niyato%2C+D">Dusit Niyato</a>, <a href="/search/eess?searchtype=author&query=Xiong%2C+Z">Zehui Xiong</a>, <a href="/search/eess?searchtype=author&query=Kang%2C+J">Jiawen Kang</a>, <a href="/search/eess?searchtype=author&query=Ai%2C+B">Bo Ai</a>, <a href="/search/eess?searchtype=author&query=Han%2C+Z">Zhu Han</a>, <a href="/search/eess?searchtype=author&query=Kim%2C+D+I">Dong In Kim</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.14140v1-abstract-short" style="display: inline;"> Groundbreaking applications such as ChatGPT have heightened research interest in generative artificial intelligence (GAI). Essentially, GAI excels not only in content generation but also in signal processing, offering support for wireless sensing. Hence, we introduce a novel GAI-assisted human flow detection system (G-HFD). 
Rigorously, G-HFD first uses channel state information (CSI) to estimate t… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.14140v1-abstract-full').style.display = 'inline'; document.getElementById('2404.14140v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.14140v1-abstract-full" style="display: none;"> Groundbreaking applications such as ChatGPT have heightened research interest in generative artificial intelligence (GAI). Essentially, GAI excels not only in content generation but also in signal processing, offering support for wireless sensing. Hence, we introduce a novel GAI-assisted human flow detection system (G-HFD). Rigorously, G-HFD first uses channel state information (CSI) to estimate the velocity and acceleration of propagation path length change of the human-induced reflection (HIR). Then, given the strong inference ability of the diffusion model, we propose a unified weighted conditional diffusion model (UW-CDM) to denoise the estimation results, enabling the detection of the number of targets. Next, we use the CSI obtained by a uniform linear array with wavelength spacing to estimate the HIR's time of flight and direction of arrival (DoA). In this process, UW-CDM solves the problem of ambiguous DoA spectrum, ensuring accurate DoA estimation. Finally, through clustering, G-HFD determines the number of subflows and the number of targets in each subflow, i.e., the subflow size. The evaluation based on practical downlink communication signals shows G-HFD's accuracy of subflow size detection can reach 91%. This validates its effectiveness and underscores the significant potential of GAI in the context of wireless sensing. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.14140v1-abstract-full').style.display = 'none'; document.getElementById('2404.14140v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.13289">arXiv:2404.13289</a> <span> [<a href="https://arxiv.org/pdf/2404.13289">pdf</a>, <a href="https://arxiv.org/format/2404.13289">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Double Mixture: Towards Continual Event Detection from Speech </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Kang%2C+J">Jingqi Kang</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+T">Tongtong Wu</a>, <a href="/search/eess?searchtype=author&query=Zhao%2C+J">Jinming Zhao</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+G">Guitao Wang</a>, <a href="/search/eess?searchtype=author&query=Wei%2C+Y">Yinwei Wei</a>, <a href="/search/eess?searchtype=author&query=Yang%2C+H">Hao Yang</a>, <a href="/search/eess?searchtype=author&query=Qi%2C+G">Guilin Qi</a>, <a href="/search/eess?searchtype=author&query=Li%2C+Y">Yuan-Fang Li</a>, <a 
href="/search/eess?searchtype=author&query=Haffari%2C+G">Gholamreza Haffari</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.13289v2-abstract-short" style="display: inline;"> Speech event detection is crucial for multimedia retrieval, involving the tagging of both semantic and acoustic events. Traditional ASR systems often overlook the interplay between these events, focusing solely on content, even though the interpretation of dialogue can vary with environmental context. This paper tackles two primary challenges in speech event detection: the continual integration of… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.13289v2-abstract-full').style.display = 'inline'; document.getElementById('2404.13289v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.13289v2-abstract-full" style="display: none;"> Speech event detection is crucial for multimedia retrieval, involving the tagging of both semantic and acoustic events. Traditional ASR systems often overlook the interplay between these events, focusing solely on content, even though the interpretation of dialogue can vary with environmental context. This paper tackles two primary challenges in speech event detection: the continual integration of new events without forgetting previous ones, and the disentanglement of semantic from acoustic events. We introduce a new task, continual event detection from speech, for which we also provide two benchmark datasets. To address the challenges of catastrophic forgetting and effective disentanglement, we propose a novel method, 'Double Mixture.' This method merges speech expertise with robust memory mechanisms to enhance adaptability and prevent forgetting. 
Our comprehensive experiments show that this task presents significant challenges that are not effectively addressed by current state-of-the-art methods in either computer vision or natural language processing. Our approach achieves the lowest rates of forgetting and the highest levels of generalization, proving robust across various continual learning sequences. Our code and data are available at https://anonymous.4open.science/status/Continual-SpeechED-6461. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.13289v2-abstract-full').style.display = 'none'; document.getElementById('2404.13289v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 20 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">The first two authors contributed equally to this work</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.10556">arXiv:2404.10556</a> <span> [<a href="https://arxiv.org/pdf/2404.10556">pdf</a>, <a href="https://arxiv.org/format/2404.10556">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Generative AI for Advanced UAV Networking </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Sun%2C+G">Geng Sun</a>, <a href="/search/eess?searchtype=author&query=Xie%2C+W">Wenwen Xie</a>, <a href="/search/eess?searchtype=author&query=Niyato%2C+D">Dusit Niyato</a>, <a href="/search/eess?searchtype=author&query=Du%2C+H">Hongyang Du</a>, <a href="/search/eess?searchtype=author&query=Kang%2C+J">Jiawen Kang</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+J">Jing Wu</a>, <a href="/search/eess?searchtype=author&query=Sun%2C+S">Sumei Sun</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+P">Ping Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.10556v1-abstract-short" style="display: inline;"> With the impressive achievements of chatGPT and Sora, generative artificial intelligence (GAI) has received increasing attention. 
Not limited to the field of content generation, GAI is also widely used to solve the problems in wireless communication scenarios due to its powerful learning and generalization capabilities. Therefore, we discuss key applications of GAI in improving unmanned aerial veh… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.10556v1-abstract-full').style.display = 'inline'; document.getElementById('2404.10556v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.10556v1-abstract-full" style="display: none;"> With the impressive achievements of chatGPT and Sora, generative artificial intelligence (GAI) has received increasing attention. Not limited to the field of content generation, GAI is also widely used to solve the problems in wireless communication scenarios due to its powerful learning and generalization capabilities. Therefore, we discuss key applications of GAI in improving unmanned aerial vehicle (UAV) communication and networking performance in this article. Specifically, we first review the key technologies of GAI and the important roles of UAV networking. Then, we show how GAI can improve the communication, networking, and security performances of UAV systems. Subsequently, we propose a novel framework of GAI for advanced UAV networking, and then present a case study of UAV-enabled spectrum map estimation and transmission rate optimization based on the proposed framework to verify the effectiveness of GAI-enabled UAV systems. Finally, we discuss some important open directions. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.10556v1-abstract-full').style.display = 'none'; document.getElementById('2404.10556v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.00628">arXiv:2404.00628</a> <span> [<a href="https://arxiv.org/pdf/2404.00628">pdf</a>, <a href="https://arxiv.org/format/2404.00628">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Fluid Antenna Relay Assisted Communication Systems Through Antenna Location Optimization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Xu%2C+R">Ruopeng Xu</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+Y">Yixuan Chen</a>, <a href="/search/eess?searchtype=author&query=Kang%2C+J">Jiawen Kang</a>, <a href="/search/eess?searchtype=author&query=Xu%2C+M">Minrui Xu</a>, <a href="/search/eess?searchtype=author&query=Yang%2C+Z">Zhaohui Yang</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+C">Chongwen Huang</a>, <a href="/search/eess?searchtype=author&query=Niyato%2C+D">Dusit Niyato</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.00628v2-abstract-short" style="display: inline;"> In this paper, we investigate the problem of resource 
allocation for fluid antenna relay (FAR) system with antenna location optimization. In the considered model, each user transmits information to a base station (BS) with help of FAR. The antenna location of the FAR is flexible and can be adapted to dynamic location distribution of the users. We formulate a sum rate maximization problem through j… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.00628v2-abstract-full').style.display = 'inline'; document.getElementById('2404.00628v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.00628v2-abstract-full" style="display: none;"> In this paper, we investigate the problem of resource allocation for fluid antenna relay (FAR) system with antenna location optimization. In the considered model, each user transmits information to a base station (BS) with help of FAR. The antenna location of the FAR is flexible and can be adapted to dynamic location distribution of the users. We formulate a sum rate maximization problem through jointly optimizing the antenna location and bandwidth allocation with meeting the minimum rate requirements, total bandwidth budget, and feasible antenna region constraints. To solve this problem, we obtain the optimal bandwidth in closed form. Based on the optimal bandwidth, the original problem is reduced to the antenna location optimization problem and an alternating algorithm is proposed. Simulation results verify the effectiveness of the proposed algorithm and the sum rate can be increased by up to 125% compared to the conventional schemes. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.00628v2-abstract-full').style.display = 'none'; document.getElementById('2404.00628v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 31 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.19200">arXiv:2403.19200</a> <span> [<a href="https://arxiv.org/pdf/2403.19200">pdf</a>, <a href="https://arxiv.org/format/2403.19200">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Cell-Free MIMO Perceptive Mobile Networks: Cloud vs. Edge Processing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Jeong%2C+S">Seongah Jeong</a>, <a href="/search/eess?searchtype=author&query=Kang%2C+J">Jinkyu Kang</a>, <a href="/search/eess?searchtype=author&query=Simeone%2C+O">Osvaldo Simeone</a>, <a href="/search/eess?searchtype=author&query=Shamai%2C+S">Shlomo Shamai</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.19200v1-abstract-short" style="display: inline;"> Perceptive mobile networks implement sensing and communication by reusing existing cellular infrastructure. 
Cell-free multiple-input multiple-output, thanks to the cooperation among distributed access points, supports the deployment of multistatic radar sensing, while providing high spectral efficiency for data communication services. To this end, the distributed access points communicate over fro… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.19200v1-abstract-full').style.display = 'inline'; document.getElementById('2403.19200v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.19200v1-abstract-full" style="display: none;"> Perceptive mobile networks implement sensing and communication by reusing existing cellular infrastructure. Cell-free multiple-input multiple-output, thanks to the cooperation among distributed access points, supports the deployment of multistatic radar sensing, while providing high spectral efficiency for data communication services. To this end, the distributed access points communicate over fronthaul links with a central processing unit acting as a cloud processor. This work explores four different types of PMN uplink solutions based on Cell-free multiple-input multiple-output, in which the sensing and decoding functionalities are carried out at either cloud or edge. Accordingly, we investigate and compare joint cloud-based decoding and sensing (CDCS), hybrid cloud-based decoding and edge-based sensing (CDES), hybrid edge-based decoding and cloud-based sensing (EDCS) and edge-based decoding and sensing (EDES). In all cases, we target a unified design problem formulation whereby the fronthaul quantization of signals received in the training and data phases are jointly designed to maximize the achievable rate under sensing requirements and fronthaul capacity constraints. 
Via numerical results, the four implementation scenarios are compared as a function of the available fronthaul resources by highlighting the relative merits of edge- and cloud-based sensing and communications. This study provides guidelines on the optimal functional allocation in fronthaul-constrained networks implementing integrated sensing and communications. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.19200v1-abstract-full').style.display = 'none'; document.getElementById('2403.19200v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">30 pages, 11 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.14154">arXiv:2403.14154</a> <span> [<a href="https://arxiv.org/pdf/2403.14154">pdf</a>, <a href="https://arxiv.org/format/2403.14154">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> LR-FHSS Transceiver for Direct-to-Satellite IoT Communications: Design, Implementation, and Verification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Jung%2C+S">Sooyeob Jung</a>, <a href="/search/eess?searchtype=author&query=Jeong%2C+S">Seongah Jeong</a>, <a href="/search/eess?searchtype=author&query=Kang%2C+J">Jinkyu Kang</a>, <a href="/search/eess?searchtype=author&query=Im%2C+G">Gyeongrae Im</a>, <a 
href="/search/eess?searchtype=author&query=Lee%2C+S">Sangjae Lee</a>, <a href="/search/eess?searchtype=author&query=Oh%2C+M">Mi-Kyung Oh</a>, <a href="/search/eess?searchtype=author&query=Ryu%2C+J+G">Joon Gyu Ryu</a>, <a href="/search/eess?searchtype=author&query=Kang%2C+J">Joonhyuk Kang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.14154v1-abstract-short" style="display: inline;"> This paper proposes a long range-frequency hopping spread spectrum (LR-FHSS) transceiver design for the Direct-to-Satellite Internet of Things (DtS-IoT) communication system. The DtS-IoT system has recently attracted attention as a promising nonterrestrial network (NTN) solution to provide high-traffic and low-latency data transfer services to IoT devices in global coverage. In particular, this st… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.14154v1-abstract-full').style.display = 'inline'; document.getElementById('2403.14154v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.14154v1-abstract-full" style="display: none;"> This paper proposes a long range-frequency hopping spread spectrum (LR-FHSS) transceiver design for the Direct-to-Satellite Internet of Things (DtS-IoT) communication system. The DtS-IoT system has recently attracted attention as a promising nonterrestrial network (NTN) solution to provide high-traffic and low-latency data transfer services to IoT devices in global coverage. In particular, this study provides guidelines for the overall DtS-IoT system architecture and design details that conform to the Long Range Wide-Area Network (LoRaWAN). Furthermore, we also detail various DtS-IoT use cases. 
Considering the multiple low-Earth orbit (LEO) satellites, we developed the LR-FHSS transceiver to improve system efficiency, which is the first attempt in real satellite communication systems using LR-FHSS. Moreover, as an extension of our previous work with perfect synchronization, we applied a robust synchronization scheme against the Doppler effect and co-channel interference (CCI) caused by LEO satellite channel environments, including signal detection for the simultaneous reception of numerous frequency hopping signals and an enhanced soft-output-Viterbi-algorithm (SOVA) for the header and payload receptions. Lastly, we present proof-of-concept implementation and testbeds using an application-specific integrated circuit (ASIC) chipset and a field-programmable gate array (FPGA) that verify the performance of the proposed LR-FHSS transceiver design of DtS-IoT communication systems. The laboratory test results reveal that the proposed LR-FHSS-based framework with the robust synchronization technique can provide wide coverage, seamless connectivity, and high throughput communication links for the realization of future sixth-generation (6G) networks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.14154v1-abstract-full').style.display = 'none'; document.getElementById('2403.14154v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">17 pages, 23 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.05826">arXiv:2403.05826</a> <span> [<a href="https://arxiv.org/pdf/2403.05826">pdf</a>, <a href="https://arxiv.org/format/2403.05826">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Cached Model-as-a-Resource: Provisioning Large Language Model Agents for Edge Intelligence in Space-air-ground Integrated Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Xu%2C+M">Minrui Xu</a>, <a href="/search/eess?searchtype=author&query=Niyato%2C+D">Dusit Niyato</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+H">Hongliang Zhang</a>, <a href="/search/eess?searchtype=author&query=Kang%2C+J">Jiawen Kang</a>, <a href="/search/eess?searchtype=author&query=Xiong%2C+Z">Zehui Xiong</a>, <a href="/search/eess?searchtype=author&query=Mao%2C+S">Shiwen Mao</a>, <a href="/search/eess?searchtype=author&query=Han%2C+Z">Zhu Han</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.05826v2-abstract-short" style="display: inline;"> Edge intelligence in space-air-ground integrated networks (SAGINs) can enable worldwide network coverage beyond geographical limitations for users to access ubiquitous and low-latency intelligence services. 
Facing global coverage and complex environments in SAGINs, edge intelligence can provision approximate large language models (LLMs) agents for users via edge servers at ground base stations (BS… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.05826v2-abstract-full').style.display = 'inline'; document.getElementById('2403.05826v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.05826v2-abstract-full" style="display: none;"> Edge intelligence in space-air-ground integrated networks (SAGINs) can enable worldwide network coverage beyond geographical limitations for users to access ubiquitous and low-latency intelligence services. Facing global coverage and complex environments in SAGINs, edge intelligence can provision approximate large language models (LLMs) agents for users via edge servers at ground base stations (BSs) or cloud data centers relayed by satellites. As LLMs with billions of parameters are pre-trained on vast datasets, LLM agents have few-shot learning capabilities, e.g., chain-of-thought (CoT) prompting for complex tasks, which raises a new trade-off between resource consumption and performance in SAGINs. In this paper, we propose a joint caching and inference framework for edge intelligence to provision sustainable and ubiquitous LLM agents in SAGINs. We introduce "cached model-as-a-resource" for offering LLMs with limited context windows and propose a novel optimization framework, i.e., joint model caching and inference, to utilize cached model resources for provisioning LLM agent services along with communication, computing, and storage resources. We design "age of thought" (AoT) considering the CoT prompting of LLMs, and propose a least AoT cached model replacement algorithm for optimizing the provisioning cost. 
We propose a deep Q-network-based modified second-bid (DQMSB) auction to incentivize network operators, which can enhance allocation efficiency by 23% while guaranteeing strategy-proofness and free from adverse selection. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.05826v2-abstract-full').style.display = 'none'; document.getElementById('2403.05826v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 9 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2402.09756">arXiv:2402.09756</a> <span> [<a href="https://arxiv.org/pdf/2402.09756">pdf</a>, <a href="https://arxiv.org/format/2402.09756">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Mixture of Experts for Network Optimization: A Large Language Model-enabled Approach </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Du%2C+H">Hongyang Du</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+G">Guangyuan Liu</a>, <a href="/search/eess?searchtype=author&query=Lin%2C+Y">Yijing Lin</a>, <a href="/search/eess?searchtype=author&query=Niyato%2C+D">Dusit Niyato</a>, <a href="/search/eess?searchtype=author&query=Kang%2C+J">Jiawen Kang</a>, <a href="/search/eess?searchtype=author&query=Xiong%2C+Z">Zehui Xiong</a>, <a 
href="/search/eess?searchtype=author&query=Kim%2C+D+I">Dong In Kim</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2402.09756v1-abstract-short" style="display: inline;"> Optimizing various wireless user tasks poses a significant challenge for networking systems because of the expanding range of user requirements. Despite advancements in Deep Reinforcement Learning (DRL), the need for customized optimization tasks for individual users complicates developing and applying numerous DRL models, leading to substantial computation resource and energy consumption and can… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.09756v1-abstract-full').style.display = 'inline'; document.getElementById('2402.09756v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2402.09756v1-abstract-full" style="display: none;"> Optimizing various wireless user tasks poses a significant challenge for networking systems because of the expanding range of user requirements. Despite advancements in Deep Reinforcement Learning (DRL), the need for customized optimization tasks for individual users complicates developing and applying numerous DRL models, leading to substantial computation resource and energy consumption and can lead to inconsistent outcomes. To address this issue, we propose a novel approach utilizing a Mixture of Experts (MoE) framework, augmented with Large Language Models (LLMs), to analyze user objectives and constraints effectively, select specialized DRL experts, and weigh each decision from the participating experts. Specifically, we develop a gate network to oversee the expert models, allowing a collective of experts to tackle a wide array of new tasks. 
Furthermore, we innovatively substitute the traditional gate network with an LLM, leveraging its advanced reasoning capabilities to manage expert model selection for joint decisions. Our proposed method reduces the need to train new DRL models for each unique optimization problem, decreasing energy consumption and AI model implementation costs. The LLM-enabled MoE approach is validated through a general maze navigation task and a specific network service provider utility maximization task, demonstrating its effectiveness and practical applicability in optimizing complex networking systems. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.09756v1-abstract-full').style.display = 'none'; document.getElementById('2402.09756v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2401.07120">arXiv:2401.07120</a> <span> [<a href="https://arxiv.org/pdf/2401.07120">pdf</a>, <a href="https://arxiv.org/format/2401.07120">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Quantum Physics">quant-ph</span> </div> </div> <p class="title is-5 mathjax"> Generative AI-enabled Quantum Computing Networks and Intelligent Resource Allocation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Xu%2C+M">Minrui Xu</a>, <a href="/search/eess?searchtype=author&query=Niyato%2C+D">Dusit Niyato</a>, <a href="/search/eess?searchtype=author&query=Kang%2C+J">Jiawen Kang</a>, <a href="/search/eess?searchtype=author&query=Xiong%2C+Z">Zehui Xiong</a>, <a href="/search/eess?searchtype=author&query=Cao%2C+Y">Yuan Cao</a>, <a href="/search/eess?searchtype=author&query=Gao%2C+Y">Yulan Gao</a>, <a href="/search/eess?searchtype=author&query=Ren%2C+C">Chao Ren</a>, <a href="/search/eess?searchtype=author&query=Yu%2C+H">Han Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2401.07120v1-abstract-short" style="display: inline;"> Quantum computing networks enable scalable collaboration and secure information exchange among multiple classical and quantum computing nodes while executing large-scale generative AI computation tasks and advanced quantum algorithms. 
Quantum computing networks overcome limitations such as the number of qubits and coherence time of entangled pairs and offer advantages for generative AI infrastruct… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.07120v1-abstract-full').style.display = 'inline'; document.getElementById('2401.07120v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2401.07120v1-abstract-full" style="display: none;"> Quantum computing networks enable scalable collaboration and secure information exchange among multiple classical and quantum computing nodes while executing large-scale generative AI computation tasks and advanced quantum algorithms. Quantum computing networks overcome limitations such as the number of qubits and coherence time of entangled pairs and offer advantages for generative AI infrastructure, including enhanced noise reduction through distributed processing and improved scalability by connecting multiple quantum devices. However, efficient resource allocation in quantum computing networks is a critical challenge due to factors including qubit variability and network complexity. In this article, we propose an intelligent resource allocation framework for quantum computing networks to improve network scalability with minimized resource costs. To achieve scalability in quantum computing networks, we formulate the resource allocation problem as stochastic programming, accounting for the uncertain fidelities of qubits and entangled pairs. Furthermore, we introduce state-of-the-art reinforcement learning (RL) algorithms, from generative learning to quantum machine learning for optimal quantum resource allocation to resolve the proposed stochastic resource allocation problem efficiently. 
Finally, we optimize the resource allocation in heterogeneous quantum computing networks supporting quantum generative learning applications and propose a multi-agent RL-based algorithm to learn the optimal resource allocation policies without prior knowledge. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.07120v1-abstract-full').style.display = 'none'; document.getElementById('2401.07120v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 January, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2401.04152">arXiv:2401.04152</a> <span> [<a href="https://arxiv.org/pdf/2401.04152">pdf</a>, <a href="https://arxiv.org/format/2401.04152">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/ICASSP48485.2024.10446249">10.1109/ICASSP48485.2024.10446249 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Cross-Speaker Encoding Network for Multi-Talker Speech Recognition </p> <p class="authors"> <span 
class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Kang%2C+J">Jiawen Kang</a>, <a href="/search/eess?searchtype=author&query=Meng%2C+L">Lingwei Meng</a>, <a href="/search/eess?searchtype=author&query=Cui%2C+M">Mingyu Cui</a>, <a href="/search/eess?searchtype=author&query=Guo%2C+H">Haohan Guo</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+X">Xixin Wu</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+X">Xunying Liu</a>, <a href="/search/eess?searchtype=author&query=Meng%2C+H">Helen Meng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2401.04152v2-abstract-short" style="display: inline;"> End-to-end multi-talker speech recognition has garnered great interest as an effective approach to directly transcribe overlapped speech from multiple speakers. Current methods typically adopt either 1) single-input multiple-output (SIMO) models with a branched encoder, or 2) single-input single-output (SISO) models based on attention-based encoder-decoder architecture with serialized output train… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.04152v2-abstract-full').style.display = 'inline'; document.getElementById('2401.04152v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2401.04152v2-abstract-full" style="display: none;"> End-to-end multi-talker speech recognition has garnered great interest as an effective approach to directly transcribe overlapped speech from multiple speakers. Current methods typically adopt either 1) single-input multiple-output (SIMO) models with a branched encoder, or 2) single-input single-output (SISO) models based on attention-based encoder-decoder architecture with serialized output training (SOT). 
In this work, we propose a Cross-Speaker Encoding (CSE) network to address the limitations of SIMO models by aggregating cross-speaker representations. Furthermore, the CSE model is integrated with SOT to leverage both the advantages of SIMO and SISO while mitigating their drawbacks. To the best of our knowledge, this work represents an early effort to integrate SIMO and SISO for multi-talker speech recognition. Experiments on the two-speaker LibrispeechMix dataset show that the CSE model reduces word error rate (WER) by 8% over the SIMO baseline. The CSE-SOT model reduces WER by 10% overall and by 16% on high-overlap speech compared to the SOT model. Code is available at https://github.com/kjw11/CSEnet-ASR. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.04152v2-abstract-full').style.display = 'none'; document.getElementById('2401.04152v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 8 January, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ICASSP2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2401.00160">arXiv:2401.00160</a> <span> [<a href="https://arxiv.org/pdf/2401.00160">pdf</a>, <a href="https://arxiv.org/format/2401.00160">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Acceleration Estimation of Signal Propagation Path Length Changes for Wireless Sensing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Wang%2C+J">Jiacheng Wang</a>, <a href="/search/eess?searchtype=author&query=Du%2C+H">Hongyang Du</a>, <a href="/search/eess?searchtype=author&query=Niyato%2C+D">Dusit Niyato</a>, <a href="/search/eess?searchtype=author&query=Zhou%2C+M">Mu Zhou</a>, <a href="/search/eess?searchtype=author&query=Kang%2C+J">Jiawen Kang</a>, <a href="/search/eess?searchtype=author&query=Poor%2C+H+V">H. Vincent Poor</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2401.00160v1-abstract-short" style="display: inline;"> As indoor applications grow in diversity, wireless sensing, vital in areas like localization and activity recognition, is attracting renewed interest. Indoor wireless sensing relies on signal processing, particularly channel state information (CSI) based signal parameter estimation. 
Nonetheless, regarding reflected signals induced by dynamic human targets, no satisfactory algorithm yet exists for… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.00160v1-abstract-full').style.display = 'inline'; document.getElementById('2401.00160v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2401.00160v1-abstract-full" style="display: none;"> As indoor applications grow in diversity, wireless sensing, vital in areas like localization and activity recognition, is attracting renewed interest. Indoor wireless sensing relies on signal processing, particularly channel state information (CSI) based signal parameter estimation. Nonetheless, regarding reflected signals induced by dynamic human targets, no satisfactory algorithm yet exists for estimating the acceleration of dynamic path length change (DPLC), which is crucial for various sensing tasks in this context. Hence, this paper proposes DP-AcE, a CSI-based DPLC acceleration estimation algorithm. We first model the relationship between the phase difference of adjacent CSI measurements and the DPLC's acceleration. Unlike existing works assuming constant velocity, DP-AcE considers both velocity and acceleration, yielding a more accurate and objective representation. Using this relationship, an algorithm combining scaling with Fourier transform is proposed to realize acceleration estimation. We evaluate DP-AcE via the acceleration estimation and acceleration-based fall detection with the collected CSI. Experimental results reveal that, using distance as the metric, DP-AcE achieves a median acceleration estimation percentage error of 4.38%. Furthermore, in multi-target scenarios, the fall detection achieves an average true positive rate of 89.56% and a false positive rate of 11.78%, demonstrating its importance in enhancing indoor wireless sensing capabilities. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.00160v1-abstract-full').style.display = 'none'; document.getElementById('2401.00160v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2401.00124">arXiv:2401.00124</a> <span> [<a href="https://arxiv.org/pdf/2401.00124">pdf</a>, <a href="https://arxiv.org/format/2401.00124">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Generative AI-driven Semantic Communication Networks: Architecture, Technologies and Applications </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Liang%2C+C">Chengsi Liang</a>, <a href="/search/eess?searchtype=author&query=Du%2C+H">Hongyang Du</a>, <a href="/search/eess?searchtype=author&query=Sun%2C+Y">Yao Sun</a>, <a href="/search/eess?searchtype=author&query=Niyato%2C+D">Dusit Niyato</a>, <a href="/search/eess?searchtype=author&query=Kang%2C+J">Jiawen Kang</a>, <a href="/search/eess?searchtype=author&query=Zhao%2C+D">Dezong Zhao</a>, <a href="/search/eess?searchtype=author&query=Imran%2C+M+A">Muhammad Ali Imran</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2401.00124v2-abstract-short" style="display: inline;"> Generative artificial intelligence (GAI) has emerged as a rapidly burgeoning field demonstrating significant potential in creating diverse 
contents intelligently and automatically. To support such artificial intelligence-generated content (AIGC) services, future communication systems should fulfill much more stringent requirements (including data rate, throughput, latency, etc.) with limited yet p… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.00124v2-abstract-full').style.display = 'inline'; document.getElementById('2401.00124v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2401.00124v2-abstract-full" style="display: none;"> Generative artificial intelligence (GAI) has emerged as a rapidly burgeoning field demonstrating significant potential in creating diverse contents intelligently and automatically. To support such artificial intelligence-generated content (AIGC) services, future communication systems should fulfill much more stringent requirements (including data rate, throughput, latency, etc.) with limited yet precious spectrum resources. To tackle this challenge, semantic communication (SemCom), dramatically reducing resource consumption via extracting and transmitting semantics, has been deemed as a revolutionary communication scheme. The advanced GAI algorithms facilitate SemCom on sophisticated intelligence for model training, knowledge base construction and channel adaption. Furthermore, GAI algorithms also play an important role in the management of SemCom networks. In this survey, we first overview the basics of GAI and SemCom as well as the synergies of the two technologies. Especially, the GAI-driven SemCom framework is presented, where many GAI models for information creation, SemCom-enabled information transmission and information effectiveness for AIGC are discussed separately. We then delve into the GAI-driven SemCom network management involving with novel management layers, knowledge management, and resource allocation. 
Finally, we envision several promising use cases, i.e., autonomous driving, smart city, and the Metaverse for a more comprehensive exploration. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.00124v2-abstract-full').style.display = 'none'; document.getElementById('2401.00124v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 January, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 29 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2312.04081">arXiv:2312.04081</a> <span> [<a href="https://arxiv.org/pdf/2312.04081">pdf</a>, <a href="https://arxiv.org/ps/2312.04081">ps</a>, <a href="https://arxiv.org/format/2312.04081">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Rate-splitting Multiple Access for Hierarchical HAP-LAP Networks under Limited Fronthaul </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Kim%2C+J">Jeongbin Kim</a>, <a href="/search/eess?searchtype=author&query=Jeong%2C+S">Seongah Jeong</a>, <a href="/search/eess?searchtype=author&query=Yoo%2C+S">Seonghoon Yoo</a>, <a href="/search/eess?searchtype=author&query=Son%2C+W">Woong Son</a>, <a href="/search/eess?searchtype=author&query=Kang%2C+J">Joonhyuk Kang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2312.04081v1-abstract-short" style="display: inline;"> In this 
correspondence, we propose hierarchical high-altitude platform (HAP)-low-altitude platform (LAP) networks with the aim of maximizing the sum-rate of ground user equipments (UEs). The multiple aerial radio units (RUs) mounted on HAPs and LAPs are managed by the central unit (CU) via constrained fronthaul links. The limitation of fronthaul capacity can be addressed through quantization, empl… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.04081v1-abstract-full').style.display = 'inline'; document.getElementById('2312.04081v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2312.04081v1-abstract-full" style="display: none;"> In this correspondence, we propose hierarchical high-altitude platform (HAP)-low-altitude platform (LAP) networks with the aim of maximizing the sum-rate of ground user equipments (UEs). The multiple aerial radio units (RUs) mounted on HAPs and LAPs are managed by the central unit (CU) via constrained fronthaul links. The limitation of fronthaul capacity can be addressed through quantization, employing the cloud radio access network (C-RAN) architecture. For spectral efficiency, we adopt the rate-splitting multiple access (RSMA), leveraging the advantages of both space-division multiple access (SDMA) and non-orthogonal multiple access (NOMA). To achieve this, we jointly optimize rate splitting, transmit power allocation, quantization noise variance, and UAV placement using an alternating optimization (AO) approach coupled with successive convex approximation (SCA) and the weighted minimum mean square error (WMMSE) method. Numerical results validate the superior performance of the proposed method compared to benchmark schemes, including partial optimizations or those without the assistance of LAPs. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.04081v1-abstract-full').style.display = 'none'; document.getElementById('2312.04081v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2312.02669">arXiv:2312.02669</a> <span> [<a href="https://arxiv.org/pdf/2312.02669">pdf</a>, <a href="https://arxiv.org/format/2312.02669">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Optics">physics.optics</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Deep-learning-driven end-to-end metalens imaging </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Seo%2C+J">Joonhyuk Seo</a>, <a href="/search/eess?searchtype=author&query=Jo%2C+J">Jaegang Jo</a>, <a href="/search/eess?searchtype=author&query=Kim%2C+J">Joohoon Kim</a>, <a href="/search/eess?searchtype=author&query=Kang%2C+J">Joonho Kang</a>, <a href="/search/eess?searchtype=author&query=Kang%2C+C">Chanik Kang</a>, <a href="/search/eess?searchtype=author&query=Moon%2C+S">Seongwon Moon</a>, <a href="/search/eess?searchtype=author&query=Lee%2C+E">Eunji Lee</a>, <a href="/search/eess?searchtype=author&query=Hong%2C+J">Jehyeong Hong</a>, <a href="/search/eess?searchtype=author&query=Rho%2C+J">Junsuk Rho</a>, <a href="/search/eess?searchtype=author&query=Chung%2C+H">Haejun Chung</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis 
has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2312.02669v3-abstract-short" style="display: inline;"> Recent advances in metasurface lenses (metalenses) have shown great potential for opening a new era in compact imaging, photography, light detection and ranging (LiDAR), and virtual reality/augmented reality (VR/AR) applications. However, the fundamental trade-off between broadband focusing efficiency and operating bandwidth limits the performance of broadband metalenses, resulting in chromatic ab… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.02669v3-abstract-full').style.display = 'inline'; document.getElementById('2312.02669v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2312.02669v3-abstract-full" style="display: none;"> Recent advances in metasurface lenses (metalenses) have shown great potential for opening a new era in compact imaging, photography, light detection and ranging (LiDAR), and virtual reality/augmented reality (VR/AR) applications. However, the fundamental trade-off between broadband focusing efficiency and operating bandwidth limits the performance of broadband metalenses, resulting in chromatic aberration, angular aberration, and a relatively low efficiency. In this study, a deep-learning-based image restoration framework is proposed to overcome these limitations and realize end-to-end metalens imaging, thereby achieving aberration-free full-color imaging for mass-produced metalenses with 10-mm diameter. Neural-network-assisted metalens imaging achieved a high resolution comparable to that of the ground truth image. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.02669v3-abstract-full').style.display = 'none'; document.getElementById('2312.02669v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 5 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">17 pages, 7 figures, 1 table</span> </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Kang%2C+J&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Kang%2C+J&start=0" class="pagination-link is-current" aria-label="Go to page 1" aria-current="page">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Kang%2C+J&start=50" class="pagination-link " aria-label="Page 2">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Kang%2C+J&start=100" class="pagination-link " aria-label="Page 3">3 </a> </li> </ul> </nav> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a 
href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg 
xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 
47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>