Search | arXiv e-print repository
<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 240 results for author: <span class="mathjax">Huang, Z</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> <div class="content"> <form method="GET" action="/search/eess" aria-role="search"> Searching in archive <strong>eess</strong>. <a href="/search/?searchtype=author&query=Huang%2C+Z">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Huang, Z"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Huang%2C+Z&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Huang, Z"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
1. arXiv:2502.11946 [pdf, other] (cs.CL, cs.AI, cs.HC, cs.SD, eess.AS)
Step-Audio: Unified Understanding and Generation in Intelligent Speech Interaction
Authors: Ailin Huang, Boyong Wu, Bruce Wang, Chao Yan, Chen Hu, Chengli Feng, Fei Tian, Feiyu Shen, Jingbei Li, Mingrui Chen, Peng Liu, Ruihang Miao, Wang You, Xi Chen, Xuerui Yang, Yechang Huang, Yuxiang Zhang, Zheng Gong, Zixin Zhang, Hongyu Zhou, Jianjian Sun, Brian Li, Chengting Feng, Changyi Wan, Hanpeng Hu, et al. (120 additional authors not shown)
Abstract: Real-time speech interaction, serving as a fundamental interface for human-machine collaboration, holds immense potential. However, current open-source models face limitations such as high costs in voice data collection, weakness in dynamic control, and limited intelligence. To address these challenges, this paper introduces Step-Audio, the first production-ready open-source solution. Key contributions include: 1) a 130B-parameter unified speech-text multi-modal model that achieves unified understanding and generation, with the Step-Audio-Chat version open-sourced; 2) a generative speech data engine that establishes an affordable voice cloning framework and produces the open-sourced lightweight Step-Audio-TTS-3B model through distillation; 3) an instruction-driven fine control system enabling dynamic adjustments across dialects, emotions, singing, and RAP; 4) an enhanced cognitive architecture augmented with tool calling and role-playing abilities to manage complex tasks effectively. Based on our new StepEval-Audio-360 evaluation benchmark, Step-Audio achieves state-of-the-art performance in human evaluations, especially in terms of instruction following. On open-source benchmarks such as LLaMA Question, it shows a 9.3% average performance improvement, demonstrating our commitment to advancing the development of open-source multi-modal language technologies. Our code and models are available at https://github.com/stepfun-ai/Step-Audio.
Submitted 18 February, 2025; v1 submitted 17 February, 2025; originally announced February 2025.

2. arXiv:2502.00358 [pdf, other] (cs.SD, cs.AI, cs.LG, cs.MM, eess.AS)
Do Audio-Visual Segmentation Models Truly Segment Sounding Objects?
Authors: Jia Li, Wenjie Zhao, Ziru Huang, Yunhui Guo, Yapeng Tian
Abstract: Unlike traditional visual segmentation, audio-visual segmentation (AVS) requires the model not only to identify and segment objects but also to determine whether they are sound sources. Recent AVS approaches, leveraging transformer architectures and powerful foundation models like SAM, have achieved impressive performance on standard benchmarks. Yet, an important question remains: Do these models genuinely integrate audio-visual cues to segment sounding objects? In this paper, we systematically investigate this issue in the context of robust AVS. Our study reveals a fundamental bias in current methods: they tend to generate segmentation masks based predominantly on visual salience, irrespective of the audio context. This bias results in unreliable predictions when sounds are absent or irrelevant. To address this challenge, we introduce AVSBench-Robust, a comprehensive benchmark incorporating diverse negative audio scenarios including silence, ambient noise, and off-screen sounds. We also propose a simple yet effective approach combining balanced training with negative samples and classifier-guided similarity learning. Our extensive experiments show that state-of-the-art AVS methods consistently fail under negative audio conditions, demonstrating the prevalence of visual bias. In contrast, our approach achieves remarkable improvements in both standard metrics and robustness measures, maintaining near-perfect false positive rates while preserving high-quality segmentation performance.
Submitted 1 February, 2025; originally announced February 2025.
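The balanced-training recipe above hinges on pairing visual frames with "negative" audio whose correct output is an empty mask. A minimal sketch of how such pairs might be constructed (all names and parameters here are illustrative assumptions, not from the paper):

```python
import numpy as np

def make_negative_pair(frames, rng, sr=16000, dur=5.0, offscreen_bank=None):
    """Pair video frames with a negative audio clip (silence, ambient
    noise, or an off-screen sound); the supervision target is an
    all-zero mask, since no visible object is sounding."""
    n = int(sr * dur)
    kind = rng.choice(["silence", "noise", "offscreen"])
    if kind == "silence":
        audio = np.zeros(n, dtype=np.float32)
    elif kind == "noise":
        audio = rng.normal(0.0, 0.01, n).astype(np.float32)
    else:  # a sound whose source is not visible in `frames`
        audio = offscreen_bank[rng.integers(len(offscreen_bank))]
    h, w = frames.shape[-2:]
    target_mask = np.zeros((h, w), dtype=np.uint8)  # no sounding object
    return audio, target_mask
```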
3. arXiv:2501.14273 [pdf, other] (eess.AS, cs.SD)
Characteristic-Specific Partial Fine-Tuning for Efficient Emotion and Speaker Adaptation in Codec Language Text-to-Speech Models
Authors: Tianrui Wang, Meng Ge, Cheng Gong, Chunyu Qiang, Haoyu Wang, Zikang Huang, Yu Jiang, Xiaobao Wang, Xie Chen, Longbiao Wang, Jianwu Dang
Abstract: Recently, emotional speech generation and speaker cloning have garnered significant interest in text-to-speech (TTS). With the open-sourcing of codec language TTS models trained on massive datasets with large-scale parameters, adapting these general pre-trained TTS models to generate speech with specific emotional expressions and target speaker characteristics has become a topic of great attention. Common approaches, such as full and adapter-based fine-tuning, often overlook the specific contributions of model parameters to emotion and speaker control. Treating all parameters uniformly during fine-tuning, especially when the target data has limited content diversity compared to the pre-training corpus, results in slow training speed and an increased risk of catastrophic forgetting. To address these challenges, we propose a characteristic-specific partial fine-tuning strategy, CSP-FT for short. First, we use a weighted-sum approach to analyze the contributions of different Transformer layers in a pre-trained codec language TTS model for emotion and speaker control in the generated speech. We then selectively fine-tune the layers with the highest and lowest characteristic-specific contributions to generate speech with target emotional expression and speaker identity. Experimental results demonstrate that our method achieves performance comparable to, or even surpassing, full fine-tuning in generating speech with specific emotional expressions and speaker identities. Additionally, CSP-FT delivers approximately 2x faster training speeds, fine-tunes only around 8% of parameters, and significantly reduces catastrophic forgetting. Furthermore, we show that codec language TTS models perform competitively with self-supervised models in speaker identification and emotion classification tasks, offering valuable insights for developing universal speech processing models.
Submitted 24 January, 2025; originally announced January 2025.
Comments: 13 pages
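The selective fine-tuning step is easy to picture in code: given per-layer contribution scores (obtained from the paper's weighted-sum analysis, which is not reproduced here), freeze everything except the highest- and lowest-scoring Transformer layers. A sketch, assuming a hypothetical PyTorch model that exposes its layers as `model.layers`:

```python
def select_layers_for_finetuning(model, scores, k=2):
    """Freeze a pre-trained codec-language TTS model except for the
    Transformer layers with the highest and lowest characteristic
    (emotion/speaker) contribution scores; `k` is illustrative."""
    for p in model.parameters():
        p.requires_grad = False  # freeze everything by default

    order = sorted(range(len(scores)), key=lambda i: scores[i])
    chosen = order[:k] + order[-k:]  # k lowest + k highest layers
    for i in chosen:
        for p in model.layers[i].parameters():  # assumes model.layers
            p.requires_grad = True
    return chosen
```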
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">13 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.11274">arXiv:2501.11274</a> <span> [<a href="https://arxiv.org/pdf/2501.11274">pdf</a>, <a href="https://arxiv.org/format/2501.11274">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> SEF-PNet: Speaker Encoder-Free Personalized Speech Enhancement with Local and Global Contexts Aggregation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Huang%2C+Z">Ziling Huang</a>, <a href="/search/eess?searchtype=author&query=Guan%2C+H">Haixin Guan</a>, <a href="/search/eess?searchtype=author&query=Wei%2C+H">Haoran Wei</a>, <a href="/search/eess?searchtype=author&query=Long%2C+Y">Yanhua Long</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.11274v1-abstract-short" style="display: inline;"> Personalized speech enhancement (PSE) methods typically rely on pre-trained speaker verification models or self-designed speaker encoders to extract target speaker clues, guiding the PSE model in isolating the desired speech. However, these approaches suffer from significant model complexity and often underutilize enrollment speaker information, limiting the potential performance of the PSE model.… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.11274v1-abstract-full').style.display = 'inline'; document.getElementById('2501.11274v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.11274v1-abstract-full" style="display: none;"> Personalized speech enhancement (PSE) methods typically rely on pre-trained speaker verification models or self-designed speaker encoders to extract target speaker clues, guiding the PSE model in isolating the desired speech. However, these approaches suffer from significant model complexity and often underutilize enrollment speaker information, limiting the potential performance of the PSE model. To address these limitations, we propose a novel Speaker Encoder-Free PSE network, termed SEF-PNet, which fully exploits the information present in both the enrollment speech and noisy mixtures. SEF-PNet incorporates two key innovations: Interactive Speaker Adaptation (ISA) and Local-Global Context Aggregation (LCA). ISA dynamically modulates the interactions between enrollment and noisy signals to enhance the speaker adaptation, while LCA employs advanced channel attention within the PSE encoder to effectively integrate local and global contextual information, thus improving feature learning. Experiments on the Libri2Mix dataset demonstrate that SEF-PNet significantly outperforms baseline models, achieving state-of-the-art PSE performance. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.11274v1-abstract-full').style.display = 'none'; document.getElementById('2501.11274v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">accpeted by ICASSP2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.10407">arXiv:2501.10407</a> <span> [<a href="https://arxiv.org/pdf/2501.10407">pdf</a>, <a href="https://arxiv.org/format/2501.10407">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> RadDet: A Wideband Dataset for Real-Time Radar Spectrum Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Huang%2C+Z">Zi Huang</a>, <a href="/search/eess?searchtype=author&query=Denman%2C+S">Simon Denman</a>, <a href="/search/eess?searchtype=author&query=Pemasiri%2C+A">Akila Pemasiri</a>, <a href="/search/eess?searchtype=author&query=Martin%2C+T">Terrence Martin</a>, <a href="/search/eess?searchtype=author&query=Fookes%2C+C">Clinton Fookes</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.10407v1-abstract-short" style="display: inline;"> Real-time detection of radar signals in a wideband radio frequency spectrum is a critical situational assessment function in electronic warfare. Compute-efficient detection models have shown great promise in recent years, providing an opportunity to tackle the spectrum detection problem. However, progress in radar spectrum detection is limited by the scarcity of publicly available wideband radar s… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.10407v1-abstract-full').style.display = 'inline'; document.getElementById('2501.10407v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.10407v1-abstract-full" style="display: none;"> Real-time detection of radar signals in a wideband radio frequency spectrum is a critical situational assessment function in electronic warfare. Compute-efficient detection models have shown great promise in recent years, providing an opportunity to tackle the spectrum detection problem. However, progress in radar spectrum detection is limited by the scarcity of publicly available wideband radar signal datasets accompanied by corresponding annotations. To address this challenge, we introduce a novel and challenging dataset for radar detection (RadDet), comprising a large corpus of radar signals occupying a wideband spectrum across diverse radar density environments and signal-to-noise ratios (SNR). RadDet contains 40,000 frames, each generated from 1 million in-phase and quadrature (I/Q) samples across a 500 MHz frequency band. 
5. arXiv:2501.10407 [pdf, other] (eess.SP)
RadDet: A Wideband Dataset for Real-Time Radar Spectrum Detection
Authors: Zi Huang, Simon Denman, Akila Pemasiri, Terrence Martin, Clinton Fookes
Abstract: Real-time detection of radar signals in a wideband radio frequency spectrum is a critical situational assessment function in electronic warfare. Compute-efficient detection models have shown great promise in recent years, providing an opportunity to tackle the spectrum detection problem. However, progress in radar spectrum detection is limited by the scarcity of publicly available wideband radar signal datasets accompanied by corresponding annotations. To address this challenge, we introduce a novel and challenging dataset for radar detection (RadDet), comprising a large corpus of radar signals occupying a wideband spectrum across diverse radar density environments and signal-to-noise ratios (SNR). RadDet contains 40,000 frames, each generated from 1 million in-phase and quadrature (I/Q) samples across a 500 MHz frequency band. RadDet includes 11 classes of radar samples across 6 different SNR settings, 2 radar density environments, and 3 different time-frequency resolutions, with corresponding time-frequency and class annotations. We evaluate the performance of various state-of-the-art real-time detection models on RadDet and a modified radar classification dataset from NIST (NIST-CBRS) to establish a novel benchmark for wideband radar spectrum detection.
Submitted 6 January, 2025; originally announced January 2025.
Comments: 5 pages, 13 figures
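Each frame pairs 1 million I/Q samples with a time-frequency view, and the three time-frequency resolutions presumably come from varying the analysis window length. A minimal sketch of rendering one such frame (window length and scaling are illustrative assumptions):

```python
import numpy as np
from scipy.signal import stft

def iq_to_frame(iq, fs=500e6, nperseg=1024):
    """Convert complex baseband I/Q samples into a time-frequency frame
    (log-magnitude spectrogram). `nperseg` controls the time/frequency
    resolution trade-off."""
    f, t, Z = stft(iq, fs=fs, nperseg=nperseg,
                   return_onesided=False)  # complex input: two-sided
    return 20 * np.log10(np.abs(Z) + 1e-12), f, t
```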
6. arXiv:2501.08825 [pdf, other] (eess.SP)
A Multi-modal Intelligent Channel Model for 6G Multi-UAV-to-Multi-Vehicle Communications
Authors: Lu Bai, Mengyuan Lu, Ziwei Huang, Xiang Cheng
Abstract: In this paper, a novel multi-modal intelligent channel model for sixth-generation (6G) multiple-unmanned aerial vehicle (multi-UAV)-to-multi-vehicle communications is proposed. To thoroughly explore the mapping relationship between the physical environment and the electromagnetic space in the complex multi-UAV-to-multi-vehicle scenario, two new parameters, i.e., terrestrial traffic density (TTD) and aerial traffic density (ATD), are developed, and a new sensing-communication intelligent integrated dataset is constructed in a suburban scenario under different TTD and ATD conditions. With the aid of sensing data, i.e., light detection and ranging (LiDAR) point clouds, the parameters of static scatterers, terrestrial dynamic scatterers, and aerial dynamic scatterers in the electromagnetic space, e.g., number, distance, angle, and power, are quantified under different TTD and ATD conditions in the physical environment. In the proposed model, the channel non-stationarity and consistency in the time and space domains and the channel non-stationarity in the frequency domain are simultaneously mimicked. The channel statistical properties, such as the time-space-frequency correlation function (TSF-CF), time stationary interval (TSI), and Doppler power spectral density (DPSD), are derived and simulated. Simulation results match ray-tracing (RT) results well, which verifies the accuracy of the proposed multi-UAV-to-multi-vehicle channel model.
Submitted 15 January, 2025; originally announced January 2025.
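For orientation, the generic definitions behind two of the named statistics are given below; the paper derives model-specific closed forms, which these are not:

```latex
% Generic time-frequency correlation function and Doppler PSD
% (illustrative; not the paper's model-specific expressions).
R_h(t, f; \Delta t, \Delta f) = \mathbb{E}\left[ h(t, f)\, h^{*}(t + \Delta t,\, f + \Delta f) \right]
S_h(t, f; \nu) = \int_{-\infty}^{\infty} R_h(t, f; \Delta t, 0)\, e^{-j 2\pi \nu \Delta t}\, \mathrm{d}\Delta t
```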
7. arXiv:2501.07459 [pdf, other] (eess.SP)
SynthSoM: A synthetic intelligent multi-modal sensing-communication dataset for Synesthesia of Machines (SoM)
Authors: Xiang Cheng, Ziwei Huang, Yong Yu, Lu Bai, Mingran Sun, Zengrui Han, Ruide Zhang, Sijiang Li
Abstract: Given the importance of datasets for sensing-communication integration research, a novel simulation platform for constructing communication and multi-modal sensory datasets is developed. The developed platform integrates three high-precision software tools, i.e., AirSim, WaveFarer, and Wireless InSite, and further achieves in-depth integration and precise alignment of them. Based on the developed platform, a new synthetic intelligent multi-modal sensing-communication dataset for Synesthesia of Machines (SoM), named SynthSoM, is proposed. The SynthSoM dataset contains various air-ground multi-link cooperative scenarios with comprehensive conditions, including multiple weather conditions, times of the day, intelligent agent densities, frequency bands, and antenna types. The SynthSoM dataset encompasses multiple data modalities, including radio-frequency (RF) channel large-scale and small-scale fading data, RF millimeter wave (mmWave) radar sensory data, and non-RF sensory data, e.g., RGB images, depth maps, and light detection and ranging (LiDAR) point clouds. The quality of the SynthSoM dataset is validated via statistics-based qualitative inspection and machine learning (ML) evaluation metrics computed against real-world measurements. The SynthSoM dataset is open-sourced and provides consistent data for cross-comparing SoM-related algorithms.
Submitted 13 January, 2025; originally announced January 2025.
8. arXiv:2501.07333 [pdf, other] (eess.SP)
Synesthesia of Machines Based Multi-Modal Intelligent V2V Channel Model
Authors: Zengrui Han, Lu Bai, Ziwei Huang, Xiang Cheng
Abstract: This paper proposes a novel sixth-generation (6G) multi-modal intelligent vehicle-to-vehicle (V2V) channel model from light detection and ranging (LiDAR) point clouds based on Synesthesia of Machines (SoM). To explore the mapping relationship between the physical environment and the electromagnetic space, a new V2V high-fidelity mixed sensing-communication integration simulation dataset with different vehicular traffic densities (VTDs) is constructed. Based on the constructed dataset, a novel scatterer recognition (ScaR) algorithm utilizing the SegNet neural network is developed to recognize scatterer spatial attributes from LiDAR point clouds via SoM. In the developed ScaR algorithm, the mapping relationship between LiDAR point clouds and scatterers is explored, where the distribution of scatterers is obtained in the form of grid maps. Furthermore, scatterers are distinguished into dynamic and static scatterers based on LiDAR point cloud features, where parameters related to scatterers, e.g., distance, angle, and number, are determined. Through ScaR, dynamic and static scatterers change with the variation of LiDAR point clouds over time, which precisely models channel non-stationarity and consistency under different VTDs. Some important channel statistical properties, such as the time-frequency correlation function (TF-CF) and Doppler power spectral density (DPSD), are obtained. Simulation results match ray-tracing (RT)-based results well, thus demonstrating the necessity of exploring the mapping relationship and the utility of the proposed model.
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.07333v1-abstract-full').style.display = 'none'; document.getElementById('2501.07333v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.03461">arXiv:2501.03461</a> <span> [<a href="https://arxiv.org/pdf/2501.03461">pdf</a>, <a href="https://arxiv.org/format/2501.03461">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Radar Signal Recognition through Self-Supervised Learning and Domain Adaptation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Huang%2C+Z">Zi Huang</a>, <a href="/search/eess?searchtype=author&query=Denman%2C+S">Simon Denman</a>, <a href="/search/eess?searchtype=author&query=Pemasiri%2C+A">Akila Pemasiri</a>, <a href="/search/eess?searchtype=author&query=Fookes%2C+C">Clinton Fookes</a>, <a href="/search/eess?searchtype=author&query=Martin%2C+T">Terrence Martin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.03461v2-abstract-short" style="display: inline;"> Automatic radar signal recognition (RSR) plays a pivotal role in electronic warfare (EW), as accurately classifying radar signals is critical for informing decision-making processes. Recent advances in deep learning have shown significant potential in improving RSR performance in domains with ample annotated data. However, these methods fall short in EW scenarios where annotated RF data are scarce… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.03461v2-abstract-full').style.display = 'inline'; document.getElementById('2501.03461v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.03461v2-abstract-full" style="display: none;"> Automatic radar signal recognition (RSR) plays a pivotal role in electronic warfare (EW), as accurately classifying radar signals is critical for informing decision-making processes. Recent advances in deep learning have shown significant potential in improving RSR performance in domains with ample annotated data. However, these methods fall short in EW scenarios where annotated RF data are scarce or impractical to obtain. To address these challenges, we introduce a self-supervised learning (SSL) method which utilises masked signal modelling and RF domain adaption to enhance RSR performance in environments with limited RF samples and labels. 
9. arXiv:2501.03461 [pdf, other] (cs.LG, cs.AI, eess.SP)
Radar Signal Recognition through Self-Supervised Learning and Domain Adaptation
Authors: Zi Huang, Simon Denman, Akila Pemasiri, Clinton Fookes, Terrence Martin
Abstract: Automatic radar signal recognition (RSR) plays a pivotal role in electronic warfare (EW), as accurately classifying radar signals is critical for informing decision-making processes. Recent advances in deep learning have shown significant potential in improving RSR performance in domains with ample annotated data. However, these methods fall short in EW scenarios where annotated RF data are scarce or impractical to obtain. To address these challenges, we introduce a self-supervised learning (SSL) method which utilises masked signal modelling and RF domain adaptation to enhance RSR performance in environments with limited RF samples and labels. Specifically, we investigate pre-training masked autoencoders (MAE) on baseband in-phase and quadrature (I/Q) signals from various RF domains and subsequently transfer the learned representation to the radar domain, where annotated data are limited. Empirical results show that our lightweight self-supervised ResNet model with domain adaptation achieves up to a 17.5% improvement in 1-shot classification accuracy when pre-trained on in-domain signals (i.e., radar signals) and up to a 16.31% improvement when pre-trained on out-of-domain signals (i.e., comm signals), compared to its baseline without SSL. We also provide reference results for several MAE designs and pre-training strategies, establishing a new benchmark for few-shot radar signal classification.
Submitted 13 January, 2025; v1 submitted 6 January, 2025; originally announced January 2025.
Comments: 5 pages, 9 figures
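Masked signal modelling, the pre-training objective named above, hides spans of the raw I/Q sequence and trains the network to reconstruct them. A toy version of that objective (patch size, encoder, and mask ratio are illustrative assumptions, not the paper's MAE configuration):

```python
import torch
import torch.nn as nn

class MaskedSignalAutoencoder(nn.Module):
    """Toy masked autoencoder over I/Q sequences: random patches are
    zeroed out and reconstructed; loss is taken on masked patches."""
    def __init__(self, patch=64, dim=128):
        super().__init__()
        self.patch = patch
        self.encode = nn.Linear(2 * patch, dim)  # I and Q per patch
        self.decode = nn.Linear(dim, 2 * patch)

    def forward(self, iq, mask_ratio=0.75):
        # iq: (batch, 2, length), length divisible by `patch`
        b, _, n = iq.shape
        x = iq.reshape(b, 2, n // self.patch, self.patch)
        x = x.permute(0, 2, 1, 3).reshape(b, -1, 2 * self.patch)
        mask = torch.rand(b, x.shape[1], device=x.device) < mask_ratio
        x_in = x.masked_fill(mask.unsqueeze(-1), 0.0)  # hide patches
        rec = self.decode(torch.relu(self.encode(x_in)))
        # reconstruction loss only on the masked patches
        return ((rec - x) ** 2)[mask].mean()
```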
10. arXiv:2501.02530 [pdf, other] (cs.RO, cs.DC, eess.SY)
UDMC: Unified Decision-Making and Control Framework for Urban Autonomous Driving with Motion Prediction of Traffic Participants
Authors: Haichao Liu, Kai Chen, Yulin Li, Zhenmin Huang, Ming Liu, Jun Ma
Abstract: Current autonomous driving systems often struggle to balance decision-making and motion control while ensuring safety and traffic rule compliance, especially in complex urban environments. Existing methods may fall short due to separate handling of these functionalities, leading to inefficiencies and safety compromises. To address these challenges, we introduce UDMC, an interpretable and unified Level 4 autonomous driving framework. UDMC integrates decision-making and motion control into a single optimal control problem (OCP), considering the dynamic interactions with surrounding vehicles, pedestrians, road lanes, and traffic signals. By employing innovative potential functions to model traffic participants and regulations, and incorporating a specialized motion prediction module, our framework enhances on-road safety and rule adherence. The integrated design allows for real-time execution of flexible maneuvers suited to diverse driving scenarios. High-fidelity simulations conducted in CARLA exemplify the framework's computational efficiency, robustness, and safety, resulting in superior driving performance when compared against various baseline models. Our open-source project is available at https://github.com/henryhcliu/udmc_carla.git.
Submitted 5 January, 2025; originally announced January 2025.
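The abstract leaves the potential functions unspecified; a common realization of the idea is a repulsive term added to the OCP stage cost that grows as the ego vehicle approaches a predicted participant position. An illustrative sketch (parameters hypothetical, not the paper's formulation):

```python
import numpy as np

def obstacle_potential(ego_xy, obs_xy, scale=5.0, sigma=2.0):
    """Gaussian repulsive potential added to the OCP stage cost;
    `scale` and `sigma` are illustrative tuning parameters."""
    d2 = np.sum((np.asarray(ego_xy) - np.asarray(obs_xy)) ** 2)
    return scale * np.exp(-d2 / (2.0 * sigma ** 2))

# Stage cost = tracking/control effort + sum of such potentials over
# predicted positions of surrounding vehicles and pedestrians, so
# collision avoidance emerges from solving the OCP.
```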
11. arXiv:2501.01773 [pdf, other] (eess.IV, cs.CV)
Compressed Domain Prior-Guided Video Super-Resolution for Cloud Gaming Content
Authors: Qizhe Wang, Qian Yin, Zhimeng Huang, Weijia Jiang, Yi Su, Siwei Ma, Jiaqi Zhang
Abstract: Cloud gaming is an advanced form of Internet service that necessitates local terminals to decode within limited resources and time latency. Super-Resolution (SR) techniques are often employed on these terminals as an efficient way to reduce the required bit-rate bandwidth for cloud gaming. However, insufficient attention has been paid to SR of compressed game video content. Most SR networks amplify block artifacts and ringing effects in decoded frames while ignoring edge details of game content, leading to unsatisfactory reconstruction results. In this paper, we propose a novel lightweight network called Coding Prior-Guided Super-Resolution (CPGSR) to address the SR challenges in compressed game video content. First, we design a Compressed Domain Guided Block (CDGB) to extract features of different depths from coding priors, which are subsequently integrated with features from the U-net backbone. Then, a series of re-parameterization blocks are utilized for reconstruction. Ultimately, inspired by the quantization in video coding, we propose a partitioned focal frequency loss to effectively guide the model's focus on preserving high-frequency information. Extensive experiments demonstrate the advancement of our approach.

arXiv:2412.12677 [pdf, ps, other] (eess.SP)
A Simplified Algorithm for Joint Real-Time Synchronization, NLoS Identification, and Multi-Agent Localization
Authors: Yili Deng, Jie Fan, Jiguang He, Baojia Luo, Miaomiao Dong, Zhongyi Huang
Abstract: Real-time, high-precision localization in large-scale wireless networks faces two primary challenges: clock offsets caused by network asynchrony and non-line-of-sight (NLoS) conditions. To tackle these challenges, we propose a low-complexity real-time algorithm for joint synchronization and NLoS identification-based localization. For precise synchronization, we resolve clock offsets based on time-of-arrival measurements accumulated over all past time instances, modeling it as a large-scale linear least squares (LLS) problem. To alleviate the high computational burden of solving this LLS, we introduce the blockwise recursive Moore-Penrose inverse (BRMP) technique, a generalized recursive least squares approach, and derive a simplified formulation of BRMP tailored specifically to the real-time synchronization problem. Furthermore, we formulate joint NLoS identification and localization as a robust least squares regression (RLSR) problem and address it with an efficient iterative approach. Simulations show that the proposed algorithm achieves sub-nanosecond synchronization accuracy and centimeter-level localization precision while maintaining low computational overhead.
Submitted 17 December, 2024; originally announced December 2024.
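
The recursive flavor of the update is easy to see in a standard blockwise recursive least squares step (a sketch of the generic technique that BRMP generalizes, not the paper's simplified BRMP; variable names are illustrative):

    import numpy as np

    def rls_block_update(theta, P, A_new, b_new, lam=1.0):
        """Fold a new block of equations A_new @ x ~ b_new into the running
        estimate theta without re-solving the full system. P tracks the
        running inverse Gram matrix; lam is an optional forgetting factor."""
        A = np.atleast_2d(A_new)
        b = np.atleast_1d(b_new)
        S = lam * np.eye(A.shape[0]) + A @ P @ A.T   # innovation covariance
        K = P @ A.T @ np.linalg.inv(S)               # block gain
        theta = theta + K @ (b - A @ theta)
        P = (P - K @ A @ P) / lam
        return theta, P

Each time instance contributes one small block, so the per-step cost stays constant instead of growing with the accumulated measurement history.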

arXiv:2412.00533 [pdf, other] (eess.SY)
Maintaining reliability while navigating unprecedented uncertainty: a synthesis of and guide to advances in electric sector resource adequacy
Authors: Gabriel Mantegna, Ziting Huang, Guillaume Van Caelenberg, Bethany Frew, Muireann Lynch, Mark O'Malley
Abstract: The reliability of the electric grid has in recent years become a larger concern for regulators, planners, and consumers due to several high-impact outage events, as well as the potential for even more impactful events in the future. These concerns are largely the result of decades-old resource adequacy (RA) planning frameworks being insufficiently adapted to the current types of uncertainty faced by planners, including many sources of deep uncertainty for which probability distributions cannot be defensibly assigned. There are emerging methodologies for dealing with these new types of uncertainty in RA assessment and procurement frameworks, but their adoption has been hindered by the lack of a consistent understanding of terminology related to RA and the related concept of resilience, as well as a lack of syntheses of the available methodologies. Here we provide an overview of RA and its relationship to resilience, a summary of available methods for dealing with the emerging types of uncertainty faced by RA assessment, and an overview of procurement methodologies for operationalizing RA in the context of these types of uncertainty. This paper provides a synthesis and guide for both researchers and practitioners seeking to navigate a new, much more uncertain era of power system planning.
Submitted 30 November, 2024; originally announced December 2024.
Comments: 10 pages

arXiv:2411.14525 [pdf, other] (eess.IV, cs.CV)
SegBook: A Simple Baseline and Cookbook for Volumetric Medical Image Segmentation
Authors: Jin Ye, Ying Chen, Yanjun Li, Haoyu Wang, Zhongying Deng, Ziyan Huang, Yanzhou Su, Chenglong Ma, Yuanfeng Ji, Junjun He
Abstract: Computed Tomography (CT) is one of the most popular modalities for medical imaging. To date, CT images have contributed to the largest publicly available datasets for volumetric medical segmentation tasks, covering full-body anatomical structures. Large amounts of full-body CT images provide the opportunity to pre-train powerful models, e.g., STU-Net pre-trained in a supervised fashion, to segment numerous anatomical structures. However, it remains unclear under which conditions these pre-trained models can be transferred to various downstream medical segmentation tasks, particularly when segmenting other modalities and diverse targets. To address this problem, a large-scale benchmark for comprehensive evaluation is crucial for finding these conditions. Thus, we collected 87 public datasets varying in modality, target, and sample size to evaluate the transfer ability of full-body CT pre-trained models. We then employed a representative model, STU-Net at multiple model scales, to conduct transfer learning across modalities and targets. Our experimental results show that (1) there may be a bottleneck effect concerning dataset size in fine-tuning, with more improvement on both small- and large-scale datasets than on medium-size ones; (2) models pre-trained on full-body CT demonstrate effective modality transfer, adapting well to other modalities such as MRI; and (3) pre-training on full-body CT not only supports strong performance in structure detection but also shows efficacy in lesion detection, showcasing adaptability across target tasks. We hope that this large-scale open evaluation of transfer learning can direct future research in volumetric medical image segmentation.
Submitted 21 November, 2024; originally announced November 2024.
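
The transfer recipe the benchmark evaluates boils down to fine-tuning a CT-pretrained segmentation network on each downstream dataset. A minimal sketch, assuming a PyTorch model with a final `head` convolution (the attribute name and training loop are illustrative, not SegBook's code):

    import torch
    from torch import nn

    def finetune(model, n_target_classes, loader, epochs=5, lr=1e-4):
        """Swap the output head for the downstream label set, then train
        the whole network end to end on the target dataset."""
        model.head = nn.Conv3d(model.head.in_channels, n_target_classes, 1)
        opt = torch.optim.AdamW(model.parameters(), lr=lr)
        loss_fn = nn.CrossEntropyLoss()
        for _ in range(epochs):
            for vol, mask in loader:           # (B, C, D, H, W), (B, D, H, W)
                opt.zero_grad()
                loss = loss_fn(model(vol), mask)
                loss.backward()
                opt.step()
        return model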

arXiv:2411.13602 [pdf] (eess.IV, cs.AI, cs.CV)
Large-scale cross-modality pretrained model enhances cardiovascular state estimation and cardiomyopathy detection from electrocardiograms: An AI system development and multi-center validation study
Authors: Zhengyao Ding, Yujian Hu, Youyao Xu, Chengchen Zhao, Ziyu Li, Yiheng Mao, Haitao Li, Qian Li, Jing Wang, Yue Chen, Mengjia Chen, Longbo Wang, Xuesen Chu, Weichao Pan, Ziyi Liu, Fei Wu, Hongkun Zhang, Ting Chen, Zhengxing Huang
Abstract: Cardiovascular diseases (CVDs) present significant challenges for early and accurate diagnosis. While cardiac magnetic resonance imaging (CMR) is the gold standard for assessing cardiac function and diagnosing CVDs, its high cost and technical complexity limit accessibility. In contrast, electrocardiography (ECG) offers promise for large-scale early screening. This study introduces CardiacNets, an innovative model that enhances ECG analysis by leveraging the diagnostic strengths of CMR through cross-modal contrastive learning and generative pretraining. CardiacNets serves two primary functions: (1) it evaluates detailed cardiac function indicators and screens for potential CVDs, including coronary artery disease, cardiomyopathy, pericarditis, heart failure, and pulmonary hypertension, using ECG input; and (2) it enhances interpretability by generating high-quality CMR images from ECG data. We train and validate the proposed CardiacNets on two large-scale public datasets (the UK Biobank with 41,519 individuals and MIMIC-IV-ECG comprising 501,172 samples) as well as three private datasets (FAHZU with 410 individuals, SAHZU with 464 individuals, and QPH with 338 individuals), and the findings demonstrate that CardiacNets consistently outperforms traditional ECG-only models, substantially improving screening accuracy. Furthermore, the generated CMR images provide valuable diagnostic support for physicians of all experience levels. This proof-of-concept study highlights how ECG can facilitate cross-modal insights into cardiac function assessment, paving the way for enhanced CVD screening and diagnosis at a population level.
Submitted 19 November, 2024; originally announced November 2024.
Comments: 23 pages, 8 figures
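
Cross-modal contrastive pretraining of this kind is typically a CLIP-style objective over paired embeddings; a minimal sketch (the encoders and batch pairing are assumed, not CardiacNets' actual architecture):

    import torch
    import torch.nn.functional as F

    def cross_modal_contrastive_loss(ecg_emb, cmr_emb, tau=0.07):
        """Symmetric InfoNCE over a batch of paired ECG/CMR embeddings:
        matched pairs are pulled together, mismatched pairs pushed apart."""
        ecg = F.normalize(ecg_emb, dim=-1)
        cmr = F.normalize(cmr_emb, dim=-1)
        logits = ecg @ cmr.t() / tau
        labels = torch.arange(logits.size(0), device=logits.device)
        return 0.5 * (F.cross_entropy(logits, labels) +
                      F.cross_entropy(logits.t(), labels))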

arXiv:2411.08570 [pdf, other] (eess.SP)
Electromagnetic Modeling and Capacity Analysis of Rydberg Atom-Based MIMO System
Authors: Shuai S. A. Yuan, Xinyi Y. I. Xu, Jinpeng Yuan, Guoda Xie, Chongwen Huang, Xiaoming Chen, Zhixiang Huang, Wei E. I. Sha
Abstract: Rydberg atom-based antennas exploit the quantum properties of highly excited Rydberg atoms, providing unique advantages over classical antennas, such as high sensitivity, broad frequency range, and compact size. Despite increasing interest in their applications in antenna and communication engineering, two key properties, namely the lack of polarization multiplexing and isotropic reception without mutual coupling, remain unexplored in the analysis of Rydberg atom-based spatial multiplexing, i.e., multiple-input multiple-output (MIMO), communications. Generally, the design considerations for any antenna, even an atomic one, can be reduced to factors such as radiation patterns, efficiency, and polarization, allowing them to be seamlessly integrated into existing system models. In this letter, we extract the antenna properties from the relevant quantum characteristics, enabling electromagnetic modeling and capacity analysis of Rydberg MIMO systems in both far-field and near-field scenarios. By employing a ray-based method for far-field analysis and the dyadic Green's function for near-field calculation, our results indicate that Rydberg atom-based antenna arrays offer specific advantages over classical dipole-type arrays in single-polarization MIMO communications.
Submitted 13 November, 2024; originally announced November 2024.
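
Once a channel matrix H has been built from either the ray-based far-field model or the dyadic Green's function, the capacity comparison reduces to the standard log-det formula; a sketch assuming equal power allocation (constructing H itself is the paper's modeling contribution and is not reproduced here):

    import numpy as np

    def mimo_capacity(H, snr_linear):
        """Capacity (bits/s/Hz) of an Nr x Nt MIMO channel with equal power
        split across transmit elements: log2 det(I + snr/Nt * H H^H)."""
        nr, nt = H.shape
        M = np.eye(nr) + (snr_linear / nt) * (H @ H.conj().T)
        _, logabsdet = np.linalg.slogdet(M)    # stable log-determinant
        return logabsdet / np.log(2.0)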

arXiv:2411.07683 [pdf, other] (eess.SP)
Hybrid Channel Modeling and Environment Reconstruction for Terahertz Monostatic Sensing
Authors: Yejian Lyu, Zeyu Huang, Stefan Schwarz, Chong Han
Abstract: Terahertz (THz) integrated sensing and communication (ISAC) aims to integrate novel functionalities, such as positioning and environmental sensing, into communication systems. Accurate channel modeling is crucial for the design and performance evaluation of future ISAC systems. In this paper, a THz measurement campaign for monostatic sensing is presented. Channel measurements based on a vector network analyzer (VNA) are conducted in a laboratory scenario, where the transmitter and receiver are positioned together to mimic monostatic sensing. The center frequency and measurement bandwidth are 300 GHz and 20 GHz, respectively. A DSS scheme is employed to capture spatial sensing channel profiles. Measurements are conducted across 28 transceiver locations arranged along an 'L'-shaped route. An element-wise SAGE algorithm is then used to estimate the multipath component (MPC) parameters, i.e., amplitude and delay. Specular and diffuse reflections are analyzed based on geometric principles and the estimated MPC parameters, where the effects of the radiation pattern are observed. A geometry-based MPC trajectory tracking algorithm is then proposed to classify the MPCs and de-embed the effects of the radiation pattern. Following this algorithm, a hybrid channel model is proposed based on the de-embedded MPC parameters. In this hybrid channel model for monostatic sensing, the MPCs are categorized into target-related and environment-related components. The target-related components are utilized for target detection and identification, while the environment-related ones support geometrical scenario reconstruction. A demonstration of geometrical environment reconstruction, along with an analysis of reflection loss for target identification, is subsequently presented. This work offers valuable insights into THz monostatic sensing channel modeling and the design of future THz ISAC systems.
Submitted 12 November, 2024; originally announced November 2024.
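
The geometric classification rests on a simple fact of monostatic sensing: a single-bounce MPC's delay is twice the transceiver-to-reflector distance divided by the speed of light. A toy sketch of delay-based labeling (positions, tolerance, and the dictionary of candidate targets are illustrative assumptions, not the paper's tracking algorithm):

    import numpy as np

    C = 299_792_458.0  # speed of light, m/s

    def monostatic_delay(tx_pos, reflector_pos):
        """Round-trip delay of a single-bounce path in monostatic sensing."""
        return 2.0 * np.linalg.norm(np.asarray(tx_pos) - np.asarray(reflector_pos)) / C

    def classify_mpc(tx_pos, delay, targets, tol=0.2e-9):
        """Label an estimated MPC as target-related if its delay matches the
        expected delay of a known candidate target, else environment-related."""
        for name, pos in targets.items():
            if abs(delay - monostatic_delay(tx_pos, pos)) < tol:
                return name
        return "environment"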

arXiv:2411.05027 [pdf, other] (cs.CV, cs.AI, eess.IV) doi:10.1109/MGRS.2024.3483459
Generative Artificial Intelligence Meets Synthetic Aperture Radar: A Survey
Authors: Zhongling Huang, Xidan Zhang, Zuqian Tang, Feng Xu, Mihai Datcu, Junwei Han
Abstract: SAR images possess unique attributes that present challenges for both human observers and vision AI models to interpret, owing to their electromagnetic characteristics. The interpretation of SAR images encounters various hurdles, one of the primary obstacles being the data itself, which includes issues related to both the quantity and quality of the data. These challenges can be addressed using generative AI (GenAI), a rapidly advancing branch of artificial intelligence that has enabled the creation of text, photorealistic images, videos, and material in other modalities. This paper aims to comprehensively investigate the intersection of GenAI and SAR. First, we illustrate the common data generation-based applications in the SAR field and compare them with computer vision tasks, analyzing their similarities, differences, and general challenges. Then, an overview of the latest GenAI models is systematically reviewed, including various basic models and their variations targeting the general challenges. Additionally, the corresponding applications in the SAR domain are also included. Specifically, we propose to summarize the physical-model-based simulation approaches for SAR, and analyze the hybrid modeling methods that combine GenAI and interpretable models. The evaluation methods that have been or could be applied to SAR are also explored. Finally, the potential challenges and future prospects are discussed. To the best of our knowledge, this survey is the first exhaustive examination of the interdiscipline of SAR and GenAI, encompassing a wide range of topics, including deep neural networks, physical models, computer vision, and SAR images. The resources of this survey are open-source at https://github.com/XAI4SAR/GenAIxSAR.
Submitted 4 November, 2024; originally announced November 2024.

arXiv:2411.03711 [pdf, other] (eess.SP)
Multi-Modal Intelligent Channel Modeling: A New Modeling Paradigm via Synesthesia of Machines
Authors: Lu Bai, Ziwei Huang, Mingran Sun, Xiang Cheng, Lizhen Cui
Abstract: In the future sixth-generation (6G) era, to support accurate localization sensing and efficient communication link establishment for intelligent agents, a comprehensive understanding of the surrounding environment and proper channel modeling are indispensable. Existing methods that solely exploit radio frequency (RF) communication information struggle to achieve accurate channel modeling. Fortunately, multi-modal devices are deployed on intelligent agents to obtain environmental features, which can further assist channel modeling. Some research efforts have already been devoted to utilizing multi-modal information to facilitate channel modeling, but a comprehensive review is still lacking. To fill this gap, we embark on an initial endeavor to review multi-modal intelligent channel modeling (MMICM) via Synesthesia of Machines (SoM). Compared to channel modeling approaches that solely utilize RF communication information, the utilization of multi-modal information can provide a more in-depth understanding of the propagation environment around the transceiver, thus facilitating more accurate channel modeling. First, this paper introduces existing channel modeling approaches from the perspective of channel modeling evolution. Then, we elaborate on and investigate recent advances in capturing typical channel characteristics and features, i.e., channel non-stationarity and consistency, by characterizing the mathematical, spatial, coupling, and mapping relationships. In addition, applications that can be supported by MMICM are summarized and analyzed. To corroborate the superiority of MMICM via SoM, we present simulation results and analysis. Finally, some open issues and potential directions for MMICM are outlined from the perspectives of measurements, modeling, and applications.
Submitted 6 November, 2024; originally announced November 2024.

arXiv:2410.20466 [pdf, other] (eess.IV, cs.CV)
Guidance Disentanglement Network for Optics-Guided Thermal UAV Image Super-Resolution
Authors: Zhicheng Zhao, Juanjuan Gu, Chenglong Li, Chun Wang, Zhongling Huang, Jin Tang
Abstract: Optics-guided Thermal UAV image Super-Resolution (OTUAV-SR) has attracted significant research interest due to its potential applications in security inspection, agricultural measurement, and object detection. Existing methods often employ a single guidance model to generate guidance features from optical images to assist thermal UAV image super-resolution. However, a single guidance model makes it difficult to generate effective guidance features under both favorable and adverse conditions in UAV scenarios, thus limiting the performance of OTUAV-SR. To address this issue, we propose a novel Guidance Disentanglement Network (GDNet), which disentangles the optical image representation according to typical UAV scenario attributes to form guidance features under both favorable and adverse conditions, for robust OTUAV-SR. Moreover, we design an attribute-aware fusion module to combine all attribute-based optical guidance features, which forms a more discriminative representation and fits the attribute-agnostic guidance process. To facilitate OTUAV-SR research in complex UAV scenarios, we introduce VGTSR2.0, a large-scale benchmark dataset containing 3,500 aligned optical-thermal image pairs captured under diverse conditions and scenes. Extensive experiments on VGTSR2.0 demonstrate that GDNet significantly improves OTUAV-SR performance over state-of-the-art methods, especially in the challenging low-light and foggy environments commonly encountered in UAV scenarios. The dataset and code will be publicly available at https://github.com/Jocelyney/GDNet.
Submitted 27 October, 2024; originally announced October 2024.
Comments: 18 pages, 19 figures, 8 tables
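
An attribute-aware fusion of several guidance feature maps can be sketched as input-dependent softmax gating over the attribute branches (a minimal PyTorch sketch under assumed shapes, not GDNet's actual module):

    import torch
    from torch import nn

    class AttributeFusion(nn.Module):
        """Fuse n_attr attribute-specific guidance maps of shape (B, C, H, W)
        with input-dependent softmax weights (attention-style gating)."""
        def __init__(self, n_attr, channels):
            super().__init__()
            self.gate = nn.Sequential(
                nn.AdaptiveAvgPool2d(1), nn.Flatten(),
                nn.Linear(channels * n_attr, n_attr))

        def forward(self, feats):                   # feats: list of (B, C, H, W)
            stacked = torch.stack(feats, dim=1)     # (B, n_attr, C, H, W)
            w = self.gate(torch.cat(feats, dim=1))  # (B, n_attr)
            w = torch.softmax(w, dim=1)[..., None, None, None]
            return (w * stacked).sum(dim=1)         # (B, C, H, W)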

arXiv:2410.18784 [pdf, ps, other] (cs.LG, eess.SP, math.NA, math.ST, stat.ML)
Denoising diffusion probabilistic models are optimally adaptive to unknown low dimensionality
Authors: Zhihan Huang, Yuting Wei, Yuxin Chen
Abstract: The denoising diffusion probabilistic model (DDPM) has emerged as a mainstream generative model in generative AI. While sharp convergence guarantees have been established for the DDPM, the iteration complexity is, in general, proportional to the ambient data dimension, resulting in overly conservative theory that fails to explain its practical efficiency. This has motivated the recent work of Li and Yan (2024a) to investigate how the DDPM can achieve sampling speed-ups through automatic exploitation of intrinsic low dimensionality of data. We strengthen this line of work by demonstrating, in some sense, optimal adaptivity to unknown low dimensionality. For a broad class of data distributions with intrinsic dimension $k$, we prove that the iteration complexity of the DDPM scales nearly linearly with $k$, which is optimal when using KL divergence to measure distributional discrepancy. Notably, our work is closely aligned with the independent concurrent work of Potaptchik et al. (2024), posted two weeks prior to ours, in establishing nearly linear-$k$ convergence guarantees for the DDPM.
Submitted 29 October, 2024; v1 submitted 24 October, 2024; originally announced October 2024.
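
The object whose iteration count these guarantees bound is the standard DDPM ancestral sampler; one reverse step looks like the following (a generic sketch with a commonly used variance choice, unrelated to the paper's proofs):

    import numpy as np

    def ddpm_reverse_step(x_t, eps_hat, t, alphas, alpha_bars, rng):
        """One DDPM ancestral sampling step:
        mean = (x_t - (1-a_t)/sqrt(1-abar_t) * eps_hat) / sqrt(a_t),
        then add sigma_t * z with z ~ N(0, I) for t > 0."""
        a_t, abar_t = alphas[t], alpha_bars[t]
        mean = (x_t - (1.0 - a_t) / np.sqrt(1.0 - abar_t) * eps_hat) / np.sqrt(a_t)
        if t == 0:
            return mean
        sigma_t = np.sqrt(1.0 - a_t)   # common choice: sqrt(beta_t)
        return mean + sigma_t * rng.standard_normal(x_t.shape)

The theory then asks how many such steps are needed for the sampled distribution to approach the data distribution, and shows the answer scales with the intrinsic dimension $k$ rather than the ambient one.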

arXiv:2410.17084 [pdf, other] (cs.RO, eess.IV)
GS-LIVM: Real-Time Photo-Realistic LiDAR-Inertial-Visual Mapping with Gaussian Splatting
Authors: Yusen Xie, Zhenmin Huang, Jin Wu, Jun Ma
Abstract: In this paper, we introduce GS-LIVM, a real-time photo-realistic LiDAR-inertial-visual mapping framework with Gaussian Splatting tailored for outdoor scenes. Compared to existing methods based on Neural Radiance Fields (NeRF) and 3D Gaussian Splatting (3DGS), our approach enables real-time photo-realistic mapping while ensuring high-quality image rendering in large-scale unbounded outdoor environments. In this work, Gaussian Process Regression (GPR) is employed to mitigate the issues resulting from sparse and unevenly distributed LiDAR observations. The voxel-based 3D Gaussian map representation facilitates real-time dense mapping in large outdoor environments, with acceleration governed by custom CUDA kernels. Moreover, the overall framework is designed in a covariance-centered manner, where the estimated covariance is used to initialize the scale and rotation of 3D Gaussians, as well as to update the parameters of the GPR. We evaluate our algorithm on several outdoor datasets, and the results demonstrate that our method achieves state-of-the-art performance in terms of mapping efficiency and rendering quality. The source code is available on GitHub.
Submitted 18 October, 2024; originally announced October 2024.
Comments: 15 pages, 13 figures
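
The covariance-centered initialization has a direct linear-algebra reading: an estimated 3x3 covariance factors into a rotation (eigenvectors) and per-axis scales (square roots of eigenvalues). A sketch of that step alone (function name and clipping are assumptions):

    import numpy as np

    def gaussian_from_covariance(cov):
        """Initialize a 3D Gaussian's rotation and scale from a 3x3
        covariance: eigenvectors give the rotation, sqrt-eigenvalues the scale."""
        eigvals, eigvecs = np.linalg.eigh(cov)      # ascending, orthonormal
        scale = np.sqrt(np.clip(eigvals, 1e-12, None))
        if np.linalg.det(eigvecs) < 0:              # keep a proper rotation
            eigvecs[:, 0] *= -1.0
        return eigvecs, scale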

arXiv:2409.18701 [pdf] (eess.IV, cs.CV)
3DPX: Single Panoramic X-ray Analysis Guided by 3D Oral Structure Reconstruction
Authors: Xiaoshuang Li, Zimo Huang, Mingyuan Meng, Eduardo Delamare, Dagan Feng, Lei Bi, Bin Sheng, Lingyong Jiang, Bo Li, Jinman Kim
Abstract: Panoramic X-ray (PX) is a prevalent modality in dentistry practice owing to its wide availability and low cost. However, as a 2D projection of a 3D structure, PX suffers from anatomical information loss, and PX diagnosis is limited compared to that with 3D imaging modalities. 2D-to-3D reconstruction methods have been explored for their ability to synthesize the absent 3D anatomical information from 2D PX for use in PX image analysis. However, there are challenges in leveraging such 3D synthesized reconstructions. First, inferring 3D depth from 2D images remains a challenging task with limited accuracy. The second challenge is the joint analysis of 2D PX with its 3D synthesized counterpart, with the aim of maximizing the 2D-3D synergy while minimizing the errors arising from the synthesized image. In this study, we propose a new method, termed 3DPX (PX image analysis guided by 2D-to-3D reconstruction), to overcome these challenges. 3DPX consists of (i) a novel progressive reconstruction network to improve 2D-to-3D reconstruction and (ii) a contrastive-guided bidirectional multimodality alignment module for 3D-guided 2D PX classification and segmentation tasks. The reconstruction network progressively reconstructs 3D images with knowledge imposed on the intermediate reconstructions at multiple pyramid levels and incorporates multilayer perceptrons to improve semantic understanding. The downstream networks leverage the reconstructed images as 3D anatomical guidance for PX analysis through feature alignment, which increases the 2D-3D synergy with bidirectional feature projection and decreases the impact of potential errors with contrastive guidance. Extensive experiments on two oral datasets involving 464 studies demonstrate that 3DPX outperforms state-of-the-art methods in various tasks, including 2D-to-3D reconstruction, PX classification, and lesion segmentation.
Submitted 27 September, 2024; originally announced September 2024.

arXiv:2409.16661 [pdf, ps, other] (eess.IV)
Morphological-consistent Diffusion Network for Ultrasound Coronal Image Enhancement
Authors: Yihao Zhou, Zixun Huang, Timothy Tin-Yan Lee, Chonglin Wu, Kelly Ka-Lee Lai, De Yang, Alec Lik-hang Hung, Jack Chun-Yiu Cheng, Tsz-Ping Lam, Yong-ping Zheng
Abstract: Ultrasound curve angle (UCA) measurement provides a radiation-free and reliable evaluation for scoliosis based on ultrasound imaging. However, degraded image quality, especially in difficult-to-image patients, can prevent clinical experts from making confident measurements, even leading to misdiagnosis. In this paper, we propose a multi-stage image enhancement framework that models the high-quality image distribution via a diffusion-based model. Specifically, we integrate the underlying morphological information from images taken at different depths of the 3D volume to calibrate the reverse process toward high-quality, high-fidelity image generation. This is achieved through a fusion operation with a learnable tuner module that learns the multi-to-one mapping from multi-depth to high-quality images. Moreover, the separate learning of the high-quality image distribution and the spinal features guarantees the preservation of consistent spinal pose descriptions in the generated images, which is crucial in evaluating spinal deformities. Our proposed enhancement algorithm significantly outperforms other enhancement-based methods on ultrasound images in terms of image quality. Furthermore, intra-rater and inter-rater UCA measurements on the enhanced images yield high ICCs (0.91 and 0.89 for thoracic and lumbar angles), indicating that our method facilitates ultrasound curve angle measurement and offers promising prospects for automated scoliosis diagnosis.
Submitted 25 September, 2024; originally announced September 2024.
Remarkably, our proposed enhancement algorithm significantly outperforms other enhancement-based methods on ultrasound images in terms of image quality. Ultimately, we conduct the intra-rater and inter-rater measurements of UCA and higher ICC (0.91 and 0.89 for thoracic and lumbar angles) on enhanced images, indicating our method facilitates the measurement of ultrasound curve angles and offers promising prospects for automated scoliosis diagnosis. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.16661v1-abstract-full').style.display = 'none'; document.getElementById('2409.16661v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.15353">arXiv:2409.15353</a> <span> [<a href="https://arxiv.org/pdf/2409.15353">pdf</a>, <a href="https://arxiv.org/format/2409.15353">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Contextualization of ASR with LLM using phonetic retrieval-based augmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Lei%2C+Z">Zhihong Lei</a>, <a href="/search/eess?searchtype=author&query=Na%2C+X">Xingyu Na</a>, <a href="/search/eess?searchtype=author&query=Xu%2C+M">Mingbin Xu</a>, <a href="/search/eess?searchtype=author&query=Pusateri%2C+E">Ernest Pusateri</a>, <a href="/search/eess?searchtype=author&query=Van+Gysel%2C+C">Christophe Van Gysel</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+Y">Yuanyuan Zhang</a>, <a href="/search/eess?searchtype=author&query=Han%2C+S">Shiyi Han</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+Z">Zhen Huang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.15353v1-abstract-short" style="display: inline;"> Large language models (LLMs) have shown superb capability of modeling multimodal signals including audio and text, allowing the model to generate spoken or textual response given a speech input. However, it remains a challenge for the model to recognize personal named entities, such as contacts in a phone book, when the input modality is speech. 
<li class="arxiv-result">
<div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.12308">arXiv:2409.12308</a> <span> [<a href="https://arxiv.org/pdf/2409.12308">pdf</a>, <a href="https://arxiv.org/ps/2409.12308">ps</a>, <a href="https://arxiv.org/format/2409.12308">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div>
<p class="title is-5 mathjax"> Robust DOA Estimation Based on Dual Lawson Norm for RIS-Aided Wireless Communication Systems </p>
<p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Yu%2C+C">Canping Yu</a>, <a href="/search/eess?searchtype=author&query=Li%2C+Y">Yingsong Li</a>, <a href="/search/eess?searchtype=author&query=Li%2C+L">Liping Li</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+Z">Zhixiang Huang</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+Q">Qingqing Wu</a>, <a href="/search/eess?searchtype=author&query=de+Lamare%2C+R+C">Rodrigo C. de Lamare</a> </p>
<p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Reconfigurable intelligent surfaces (RIS) can actively perform beamforming and have become a crucial enabler for future wireless systems. The direction-of-arrival (DOA) estimates of RIS received signals can help design the reflection control matrix and improve communication quality. In this paper, we design a RIS-assisted system and propose a robust Lawson norm-based multiple-signal-classification (LN-MUSIC) DOA estimation algorithm for impulsive noise, which is divided into two parts. In the first part, the non-convex Lawson norm is used as the error criterion along with a regularization constraint to formulate the optimization problem, and a Bregman distance based alternating direction method of multipliers is then used to solve the problem and recover the desired signal. In the second part, multiple signal classification (MUSIC) is used to find the DOAs of targets based on their sparsity in the spatial domain. In addition, we also propose a RIS control matrix optimization strategy that requires no channel state information, which effectively enhances the desired signals and improves the performance of the LN-MUSIC algorithm. A Cramer-Rao lower bound (CRLB) of the proposed DOA estimation algorithm is presented and verifies its feasibility. Simulation results show that the proposed robust DOA estimation algorithm based on the Lawson norm can effectively suppress the impact of large outliers caused by impulsive noise on the estimation results, outperforming existing methods. </span> </p>
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p>
<p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 figures, 28 pages</span> </p>
</li>
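<p class="is-size-7">Once the robust Lawson-norm/ADMM stage has recovered clean snapshots, the second stage above is classical MUSIC; a minimal NumPy sketch follows, assuming a half-wavelength uniform linear array and omitting the robust recovery step entirely.</p>
<pre><code>
import numpy as np

def music_spectrum(X, n_sources, n_grid=360):
    """X: (n_antennas, n_snapshots) recovered snapshots, ULA assumed."""
    n_ant = X.shape[0]
    R = X @ X.conj().T / X.shape[1]                 # sample covariance
    _, eigvecs = np.linalg.eigh(R)                  # eigenvalues ascending
    En = eigvecs[:, : n_ant - n_sources]            # noise subspace
    angles = np.linspace(-np.pi / 2, np.pi / 2, n_grid)
    spec = np.empty(n_grid)
    for i, th in enumerate(angles):
        a = np.exp(-1j * np.pi * np.arange(n_ant) * np.sin(th))  # d = lambda/2
        spec[i] = 1.0 / np.real(a.conj() @ En @ En.conj().T @ a)
    return np.degrees(angles), spec

# Toy check: two sources at -20 and 30 degrees, light Gaussian noise.
rng = np.random.default_rng(0)
n_ant, n_snap = 8, 200
A = np.exp(-1j * np.pi * np.outer(np.arange(n_ant), np.sin(np.radians([-20, 30]))))
S = rng.standard_normal((2, n_snap)) + 1j * rng.standard_normal((2, n_snap))
X = A @ S + 0.1 * (rng.standard_normal((n_ant, n_snap)) + 1j * rng.standard_normal((n_ant, n_snap)))
ang, spec = music_spectrum(X, n_sources=2)
print(ang[np.argsort(spec)[-2:]])                   # crude peak pick near -20, 30
</code></pre>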
<li class="arxiv-result">
<div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.09214">arXiv:2409.09214</a> <span> [<a href="https://arxiv.org/pdf/2409.09214">pdf</a>, <a href="https://arxiv.org/format/2409.09214">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div>
<p class="title is-5 mathjax"> Seed-Music: A Unified Framework for High Quality and Controlled Music Generation </p>
<p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Bai%2C+Y">Ye Bai</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+H">Haonan Chen</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+J">Jitong Chen</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+Z">Zhuo Chen</a>, <a href="/search/eess?searchtype=author&query=Deng%2C+Y">Yi Deng</a>, <a href="/search/eess?searchtype=author&query=Dong%2C+X">Xiaohong Dong</a>, <a href="/search/eess?searchtype=author&query=Hantrakul%2C+L">Lamtharn Hantrakul</a>, <a href="/search/eess?searchtype=author&query=Hao%2C+W">Weituo Hao</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+Q">Qingqing Huang</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+Z">Zhongyi Huang</a>, <a href="/search/eess?searchtype=author&query=Jia%2C+D">Dongya Jia</a>, <a href="/search/eess?searchtype=author&query=La%2C+F">Feihu La</a>, <a href="/search/eess?searchtype=author&query=Le%2C+D">Duc Le</a>, <a href="/search/eess?searchtype=author&query=Li%2C+B">Bochen Li</a>, <a href="/search/eess?searchtype=author&query=Li%2C+C">Chumin Li</a>, <a href="/search/eess?searchtype=author&query=Li%2C+H">Hui Li</a>, <a href="/search/eess?searchtype=author&query=Li%2C+X">Xingxing Li</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+S">Shouda Liu</a>, <a href="/search/eess?searchtype=author&query=Lu%2C+W">Wei-Tsung Lu</a>, <a href="/search/eess?searchtype=author&query=Lu%2C+Y">Yiqing Lu</a>, <a href="/search/eess?searchtype=author&query=Shaw%2C+A">Andrew Shaw</a>, <a href="/search/eess?searchtype=author&query=Spijkervet%2C+J">Janne Spijkervet</a>, <a href="/search/eess?searchtype=author&query=Sun%2C+Y">Yakun Sun</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+B">Bo Wang</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+J">Ju-Chiang Wang</a> , et al. (13 additional authors not shown) </p>
<p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> We introduce Seed-Music, a suite of music generation systems capable of producing high-quality music with fine-grained style control. Our unified framework leverages both auto-regressive language modeling and diffusion approaches to support two key music creation workflows: controlled music generation and post-production editing. For controlled music generation, our system enables vocal music generation with performance controls from multi-modal inputs, including style descriptions, audio references, musical scores, and voice prompts. For post-production editing, it offers interactive tools for editing lyrics and vocal melodies directly in the generated audio. We encourage readers to listen to demo audio examples at https://team.doubao.com/seed-music. </span> </p>
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 13 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p>
<p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Seed-Music technical report, 20 pages, 5 figures</span> </p>
</li>
<li class="arxiv-result">
<div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.08628">arXiv:2409.08628</a> <span> [<a href="https://arxiv.org/pdf/2409.08628">pdf</a>, <a href="https://arxiv.org/format/2409.08628">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div>
<p class="title is-5 mathjax"> Rhythmic Foley: A Framework For Seamless Audio-Visual Alignment In Video-to-Audio Synthesis </p>
<p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Huang%2C+Z">Zhiqi Huang</a>, <a href="/search/eess?searchtype=author&query=Luo%2C+D">Dan Luo</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+J">Jun Wang</a>, <a href="/search/eess?searchtype=author&query=Liao%2C+H">Huan Liao</a>, <a href="/search/eess?searchtype=author&query=Li%2C+Z">Zhiheng Li</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+Z">Zhiyong Wu</a> </p>
<p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Our research introduces an innovative framework for video-to-audio synthesis, which solves the problems of audio-video desynchronization and semantic loss in the audio. By incorporating a semantic alignment adapter and a temporal synchronization adapter, our method significantly improves semantic integrity and the precision of beat point synchronization, particularly in fast-paced action sequences. Utilizing a contrastive audio-visual pre-trained encoder, our model is trained with video and high-quality audio data, improving the quality of the generated audio. This dual-adapter approach empowers users with enhanced control over audio semantics and beat effects, allowing the adjustment of the controller to achieve better results. Extensive experiments substantiate the effectiveness of our framework in achieving seamless audio-visual alignment. </span> </p>
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p>
</li>
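<p class="is-size-7">A minimal sketch of what dual-adapter conditioning could look like, assuming FiLM-style modulation; the module name, dimensions, and the beat-envelope input are assumptions for illustration, not the paper's exact architecture.</p>
<pre><code>
# Hypothetical dual-adapter conditioning sketch (FiLM-style): a semantic
# embedding and a per-frame beat envelope each modulate the generator's
# hidden states.
import torch
import torch.nn as nn

class DualAdapter(nn.Module):
    def __init__(self, hidden=256, sem_dim=512):
        super().__init__()
        self.sem = nn.Linear(sem_dim, 2 * hidden)   # semantic adapter: scale/shift
        self.tmp = nn.Conv1d(1, 2 * hidden, kernel_size=3, padding=1)  # temporal adapter

    def forward(self, h, sem_emb, beat_env):
        # h: (B, hidden, T) hidden states; sem_emb: (B, sem_dim); beat_env: (B, 1, T)
        g_s, b_s = self.sem(sem_emb).chunk(2, dim=-1)          # (B, hidden) each
        g_t, b_t = self.tmp(beat_env).chunk(2, dim=1)          # (B, hidden, T) each
        return h * (1 + g_s[..., None] + g_t) + (b_s[..., None] + b_t)

h = torch.randn(2, 256, 100)                 # e.g. 100 audio latent frames
sem = torch.randn(2, 512)                    # e.g. contrastive AV encoder output
beat = torch.rand(2, 1, 100)                 # beat-strength envelope from video
print(DualAdapter()(h, sem, beat).shape)     # torch.Size([2, 256, 100])
</code></pre>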
<li class="arxiv-result">
<div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.08300">arXiv:2409.08300</a> <span> [<a href="https://arxiv.org/pdf/2409.08300">pdf</a>, <a href="https://arxiv.org/format/2409.08300">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div>
<p class="title is-5 mathjax"> Iterative Convex Optimization for Safety-Critical Model Predictive Control </p>
<p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Liu%2C+S">Shuo Liu</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+Z">Zhe Huang</a>, <a href="/search/eess?searchtype=author&query=Zeng%2C+J">Jun Zeng</a>, <a href="/search/eess?searchtype=author&query=Sreenath%2C+K">Koushil Sreenath</a>, <a href="/search/eess?searchtype=author&query=Belta%2C+C+A">Calin A. Belta</a> </p>
<p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Safety is one of the fundamental challenges in control theory. Recently, multi-step optimal control problems for discrete-time dynamical systems were developed to ensure stability, while adhering to input constraints and safety-critical requirements. This was achieved by incorporating discrete-time Control Barrier Functions (CBFs) within a Model Predictive Control (MPC) framework. Existing work usually centers on the feasibility or safety of optimization problems when the boundaries of safe sets are clearly defined. Most of this research limits discussions to CBFs with relative degree one with respect to the system dynamics. Furthermore, real-time computation becomes challenging in MPC problems with large horizons. In this paper, we introduce a framework that addresses the safety-critical MPC problem through iterative optimization, applicable across CBFs of any relative degree. Our approach involves linearizing the nonlinear system dynamics and safety constraints, modeled as Discrete-time High-Order CBFs (DHOCBFs), at each time step. Additionally, when the boundaries of the safe sets are complex, we present a learning-based method to develop linear boundary equations for these safe sets. These equations are then converted into linearized DHOCBFs. The benefits of computational performance and safe avoidance of obstacles with diverse shapes are examined and confirmed through numerical results. </span> </p>
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p>
<p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">16 pages, 12 figures. arXiv admin note: text overlap with arXiv:2210.04361</span> </p>
</li>
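<p class="is-size-7">To make the iterative-convexification idea concrete, the toy sketch below (assuming cvxpy is installed) linearizes a circular-obstacle discrete-time CBF around the previous iterate for a double integrator and re-solves a QP; the paper's DHOCBF formulation for higher relative degrees and its learned boundaries are not reproduced.</p>
<pre><code>
import numpy as np
import cvxpy as cp

dt, N, gamma = 0.1, 20, 0.4
A = np.block([[np.eye(2), dt * np.eye(2)], [np.zeros((2, 2)), np.eye(2)]])
B = np.block([[0.5 * dt**2 * np.eye(2)], [dt * np.eye(2)]])
x0, goal = np.array([0, 0, 0, 0.0]), np.array([4, 0, 0, 0.0])
c, r = np.array([2.0, 0.05]), 0.5            # circular obstacle center/radius

def h_lin(p, pos):
    # First-order expansion of h(x) = ||pos - c||^2 - r^2 around the point p.
    return (p - c) @ (p - c) - r**2 + 2 * (p - c) @ (pos - p)

X_prev = np.linspace(x0, goal, N + 1)        # straight-line initial guess
for it in range(5):                          # toy loop, no convergence safeguards
    X, U = cp.Variable((N + 1, 4)), cp.Variable((N, 2))
    cons, cost = [X[0] == x0], 0
    for k in range(N):
        cons += [X[k + 1] == A @ X[k] + B @ U[k], cp.abs(U[k]) <= 3]
        # Linearized discrete CBF condition h(x_{k+1}) >= (1 - gamma) h(x_k).
        cons += [h_lin(X_prev[k + 1, :2], X[k + 1, :2])
                 >= (1 - gamma) * h_lin(X_prev[k, :2], X[k, :2])]
        cost += cp.sum_squares(X[k] - goal) + 0.1 * cp.sum_squares(U[k])
    cp.Problem(cp.Minimize(cost), cons).solve()
    X_prev = X.value
print(np.round(X_prev[:, :2], 2))            # positions should skirt the obstacle
</code></pre>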
<li class="arxiv-result">
<div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.01222">arXiv:2409.01222</a> <span> [<a href="https://arxiv.org/pdf/2409.01222">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div>
<p class="title is-5 mathjax"> Nonlinear PDE Constrained Optimal Dispatch of Gas and Power: A Global Linearization Approach </p>
<p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Li%2C+Y">Yuan Li</a>, <a href="/search/eess?searchtype=author&query=Lu%2C+S">Shuai Lu</a>, <a href="/search/eess?searchtype=author&query=Gu%2C+W">Wei Gu</a>, <a href="/search/eess?searchtype=author&query=Xu%2C+Y">Yijun Xu</a>, <a href="/search/eess?searchtype=author&query=Yu%2C+R">Ruizhi Yu</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+S">Suhan Zhang</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+Z">Zhikai Huang</a> </p>
<p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> The coordinated dispatch of power and gas in the electricity-gas integrated energy system (EG-IES) is fundamental for ensuring operational security. However, the gas dynamics in the natural gas system (NGS) are governed by nonlinear partial differential equations (PDEs), making the dispatch problem of the EG-IES a complicated optimization model constrained by nonlinear PDEs. To address it, we propose a globally linearized gas network model based on the Koopman operator theory, avoiding the commonly used local linearization and spatial discretization. Particularly, we propose a data-driven Koopman operator approximation approach for the globally linearized gas network model based on the extended dynamic mode decomposition, in which a physics-informed stability constraint is derived and embedded to improve the generalization ability and accuracy of the model. Based on this, we develop an optimal dispatch model for the EG-IES that first considers the nonlinear gas dynamics in the NGS. The case study verifies the effectiveness of this work. Simulation results reveal that the commonly used locally linearized gas network model fails to accurately capture the dynamic characteristics of NGS, bringing potential security threats to the system. </span> </p>
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p>
</li>
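<p class="is-size-7">The extended dynamic mode decomposition step mentioned above can be summarized in a few lines; the observable dictionary and the logistic-map data below are toy assumptions, and the paper's physics-informed stability constraint is only checked after the fit, not embedded in it.</p>
<pre><code>
# Minimal EDMD sketch: lift snapshots with a dictionary, then least-squares fit
# a linear operator K on the lifted space.
import numpy as np

def lift(x):
    """Toy observable dictionary: [x, x^2, 1]."""
    return np.concatenate([x, x**2, np.ones((1, x.shape[1]))], axis=0)

# Snapshot pairs (x_k, x_{k+1}) from some nonlinear system (logistic map here).
rng = np.random.default_rng(1)
x = rng.uniform(0, 1, (1, 300))
y = 3.5 * x * (1 - x)

Psi_x, Psi_y = lift(x), lift(y)
K = Psi_y @ np.linalg.pinv(Psi_x)           # EDMD: Psi_y ~ K Psi_x
# A posteriori spectral-radius check; the paper instead embeds stability
# as a constraint during the approximation.
print(np.max(np.abs(np.linalg.eigvals(K))))
</code></pre>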
<li class="arxiv-result">
<div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.00078">arXiv:2409.00078</a> <span> [<a href="https://arxiv.org/pdf/2409.00078">pdf</a>, <a href="https://arxiv.org/format/2409.00078">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> </div> </div>
<p class="title is-5 mathjax"> SGP-RI: A Real-Time-Trainable and Decentralized IoT Indoor Localization Model Based on Sparse Gaussian Process with Reduced-Dimensional Inputs </p>
<p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Tang%2C+Z">Zhe Tang</a>, <a href="/search/eess?searchtype=author&query=Li%2C+S">Sihao Li</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+Z">Zichen Huang</a>, <a href="/search/eess?searchtype=author&query=Yang%2C+G">Guandong Yang</a>, <a href="/search/eess?searchtype=author&query=Kim%2C+K+S">Kyeong Soo Kim</a>, <a href="/search/eess?searchtype=author&query=Smith%2C+J+S">Jeremy S. Smith</a> </p>
<p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> With Internet of Things (IoT) devices deployed in the field, there is an enormous amount of untapped potential in local computing on those devices. Harnessing this potential for indoor localization, therefore, becomes an exciting research area. Conventionally, the training and deployment of indoor localization models are based on centralized servers with substantial computational resources. This centralized approach faces several challenges, including the database's inability to accommodate the dynamic and unpredictable nature of the indoor electromagnetic environment, the model retraining costs, and the susceptibility of centralized servers to security breaches. To mitigate these challenges, we aim to amalgamate the offline and online phases of traditional indoor localization methods using a real-time-trainable and decentralized IoT indoor localization model based on Sparse Gaussian Process with Reduced-dimensional Inputs (SGP-RI), where the number and dimension of the input data are reduced through reference point and wireless access point filtering, respectively. The experimental results, based on a multi-building and multi-floor static database as well as a single-building and single-floor dynamic database, demonstrate that the proposed SGP-RI model with less than half the training samples as inducing inputs can produce localization performance comparable to the standard Gaussian Process model trained on the whole set of samples. The SGP-RI model enables the decentralization of indoor localization, facilitating its deployment to resource-constrained IoT devices, and thereby could provide enhanced security and privacy, reduced costs, and reduced network dependency. Also, the model's capability of real-time training makes it possible to quickly adapt to the time-varying indoor electromagnetic environment. </span> </p>
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p>
<p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages, 4 figures, under review for journal publication</span> </p>
</li>
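<p class="is-size-7">A minimal sketch of sparse GP prediction with inducing inputs (subset-of-regressors form, RBF kernel assumed); in SGP-RI the inducing set and the reduced input dimensions would come from the reference-point and access-point filtering described above.</p>
<pre><code>
import numpy as np

def rbf(A, B, ls=1.0):
    d2 = ((A[:, None, :] - B[None, :, :]) ** 2).sum(-1)
    return np.exp(-0.5 * d2 / ls**2)

def sor_predict(X, y, Z, Xs, noise=0.1):
    """X: (n,d) training features, Z: (m,d) inducing inputs, Xs: query points."""
    Kzz, Kzx, Kzs = rbf(Z, Z), rbf(Z, X), rbf(Z, Xs)
    # Subset-of-regressors posterior mean: Ksz (Kzx Kxz + s^2 Kzz)^-1 Kzx y
    S = Kzx @ Kzx.T + noise**2 * Kzz + 1e-8 * np.eye(len(Z))
    return Kzs.T @ np.linalg.solve(S, Kzx @ y)

rng = np.random.default_rng(0)
X = rng.uniform(-3, 3, (200, 2))                        # e.g. reduced RSSI inputs
y = np.sin(X[:, 0]) + 0.1 * rng.standard_normal(200)    # e.g. one coordinate
Z = X[rng.choice(200, 40, replace=False)]               # far fewer inducing inputs
print(sor_predict(X, y, Z, X[:5]))
</code></pre>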
<li class="arxiv-result">
<div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.12734">arXiv:2408.12734</a> <span> [<a href="https://arxiv.org/pdf/2408.12734">pdf</a>, <a href="https://arxiv.org/format/2408.12734">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div>
<p class="title is-5 mathjax"> Towards measuring fairness in speech recognition: Fair-Speech dataset </p>
<p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Veliche%2C+I">Irina-Elena Veliche</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+Z">Zhuangqun Huang</a>, <a href="/search/eess?searchtype=author&query=Kochaniyan%2C+V+A">Vineeth Ayyat Kochaniyan</a>, <a href="/search/eess?searchtype=author&query=Peng%2C+F">Fuchun Peng</a>, <a href="/search/eess?searchtype=author&query=Kalinli%2C+O">Ozlem Kalinli</a>, <a href="/search/eess?searchtype=author&query=Seltzer%2C+M+L">Michael L. Seltzer</a> </p>
<p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> The current public datasets for speech recognition (ASR) tend not to focus specifically on the fairness aspect, such as performance across different demographic groups. This paper introduces a novel dataset, Fair-Speech, a publicly released corpus to help researchers evaluate their ASR models for accuracy across a diverse set of self-reported demographic information, such as age, gender, ethnicity, geographic variation and whether the participants consider themselves native English speakers. Our dataset includes approximately 26.5K utterances in recorded speech by 593 people in the United States, who were paid to record and submit audio recordings of themselves saying voice commands. We also provide ASR baselines, including models trained on transcribed and untranscribed social media videos, as well as open source models. </span> </p>
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p>
</li>
<li class="arxiv-result">
<div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.12534">arXiv:2408.12534</a> <span> [<a href="https://arxiv.org/pdf/2408.12534">pdf</a>, <a href="https://arxiv.org/format/2408.12534">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div>
<p class="title is-5 mathjax"> Automatic Organ and Pan-cancer Segmentation in Abdomen CT: the FLARE 2023 Challenge </p>
<p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Ma%2C+J">Jun Ma</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+Y">Yao Zhang</a>, <a href="/search/eess?searchtype=author&query=Gu%2C+S">Song Gu</a>, <a href="/search/eess?searchtype=author&query=Ge%2C+C">Cheng Ge</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+E">Ershuai Wang</a>, <a href="/search/eess?searchtype=author&query=Zhou%2C+Q">Qin Zhou</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+Z">Ziyan Huang</a>, <a href="/search/eess?searchtype=author&query=Lyu%2C+P">Pengju Lyu</a>, <a href="/search/eess?searchtype=author&query=He%2C+J">Jian He</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+B">Bo Wang</a> </p>
<p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Organ and cancer segmentation in abdominal Computed Tomography (CT) scans is the prerequisite for precise cancer diagnosis and treatment. Most existing benchmarks and algorithms are tailored to specific cancer types, limiting their ability to provide comprehensive cancer analysis. This work presents the first international competition on abdominal organ and pan-cancer segmentation by providing a large-scale and diverse dataset, including 4650 CT scans with various cancer types from over 40 medical centers. The winning team established a new state-of-the-art with a deep learning-based cascaded framework, achieving average Dice Similarity Coefficient scores of 92.3% for organs and 64.9% for lesions on the hidden multi-national testing set. The dataset and code of top teams are publicly available, offering a benchmark platform to drive further innovations: https://codalab.lisn.upsaclay.fr/competitions/12239. </span> </p>
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p>
<p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">MICCAI 2024 FLARE Challenge Summary</span> </p>
</li>
<li class="arxiv-result">
<div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.10410">arXiv:2408.10410</a> <span> [<a href="https://arxiv.org/pdf/2408.10410">pdf</a>, <a href="https://arxiv.org/format/2408.10410">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div>
<p class="title is-5 mathjax"> Stream-Based Ground Segmentation for Real-Time LiDAR Point Cloud Processing on FPGA </p>
<p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhang%2C+X">Xiao Zhang</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+Z">Zhanhong Huang</a>, <a href="/search/eess?searchtype=author&query=Antony%2C+G+G">Garcia Gonzalez Antony</a>, <a href="/search/eess?searchtype=author&query=Jachimczyk%2C+W">Witek Jachimczyk</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+X">Xinming Huang</a> </p>
<p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> This paper presents a novel and fast approach for ground plane segmentation in a LiDAR point cloud, specifically optimized for processing speed and hardware efficiency on FPGA hardware platforms. Our approach leverages a channel-based segmentation method with an advanced angular data repair technique and a cross-eight-way flood-fill algorithm. This innovative approach significantly reduces the number of iterations while ensuring the high accuracy of the segmented ground plane, which makes the stream-based hardware implementation possible. To validate the proposed approach, we conducted extensive experiments on the SemanticKITTI dataset. We introduced a bird's-eye view (BEV) evaluation metric tailored for the area representation of LiDAR segmentation tasks. Our method demonstrated superior performance in terms of BEV areas when compared to the existing approaches. Moreover, we presented an optimized hardware architecture targeting a Zynq-7000 FPGA, compatible with LiDARs of various channel densities, i.e., 32, 64, and 128 channels. Our FPGA implementation, operating at 160 MHz, significantly outperforms traditional computing platforms, running 12 to 25 times faster than CPU-based solutions and up to 6 times faster than a GPU-based solution, with the added benefit of low power consumption. </span> </p>
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p>
</li>
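<p class="is-size-7">A toy flood fill over a range image illustrates the core idea; the height-step test, the eight-way neighborhood with azimuth wrap-around, and the seed row are simplifying assumptions rather than the paper's cross-eight-way design with angular repair.</p>
<pre><code>
from collections import deque
import numpy as np

def ground_flood_fill(z, seed_row=0, max_step=0.15):
    """z: (rows, cols) per-pixel height; start from the lowest channel row."""
    rows, cols = z.shape
    ground = np.zeros_like(z, dtype=bool)
    q = deque((seed_row, c) for c in range(cols))
    ground[seed_row, :] = True
    while q:
        r, c = q.popleft()
        for dr, dc in ((1, 0), (-1, 0), (0, 1), (0, -1),
                       (1, 1), (1, -1), (-1, 1), (-1, -1)):   # eight-way
            nr, nc = r + dr, (c + dc) % cols                  # wrap azimuth
            if 0 <= nr < rows and not ground[nr, nc] \
                    and abs(z[nr, nc] - z[r, c]) < max_step:
                ground[nr, nc] = True
                q.append((nr, nc))
    return ground

z = np.zeros((8, 16)); z[5:, 6:10] = 1.0      # a raised "obstacle" block
print(ground_flood_fill(z).sum(), "ground pixels")   # 116 of 128
</code></pre>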
<li class="arxiv-result">
<div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.10404">arXiv:2408.10404</a> <span> [<a href="https://arxiv.org/pdf/2408.10404">pdf</a>, <a href="https://arxiv.org/format/2408.10404">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div>
<p class="title is-5 mathjax"> Accelerating Point Cloud Ground Segmentation: From Mechanical to Solid-State Lidars </p>
<p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhang%2C+X">Xiao Zhang</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+Z">Zhanhong Huang</a>, <a href="/search/eess?searchtype=author&query=Antony%2C+G+G">Garcia Gonzalez Antony</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+X">Xinming Huang</a> </p>
<p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> In this study, we propose a novel parallel processing method for point cloud ground segmentation, aimed at the technology evolution from mechanical to solid-state Lidar (SSL). We first benchmark point-based, grid-based, and range image-based ground segmentation algorithms using the SemanticKITTI dataset. Our results indicate that the range image-based method offers superior performance and robustness, particularly in resilience to frame slicing. Implementing the proposed algorithm on an FPGA demonstrates significant improvements in processing speed and scalability of resource usage. Additionally, we develop a custom dataset using camera-SSL equipment on our test vehicle to validate the effectiveness of the parallel processing approach for SSL frames in the real world, achieving processing rates up to 30.9 times faster than CPU implementations. These findings underscore the potential of parallel processing strategies to enhance Lidar technologies for advanced perception tasks in autonomous vehicles and robotics. The data and code will be available post-publication on our GitHub repository: https://github.com/WPI-APA-Lab/GroundSeg-Solid-State-Lidar-Parallel-Processing </span> </p>
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 19 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p>
<p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">6 pages</span> </p>
</li>
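<p class="is-size-7">For context, the range-image representation that the benchmarked method relies on is a spherical projection of the cloud; a sketch with arbitrary resolution and field-of-view values follows.</p>
<pre><code>
import numpy as np

def to_range_image(points, rows=64, cols=1024, fov=(-25.0, 3.0)):
    """points: (N, 3) xyz. Returns (rows, cols) range image (0 = no return)."""
    x, y, z = points.T
    r = np.linalg.norm(points, axis=1)
    yaw = np.arctan2(y, x)                       # azimuth determines the column
    pitch = np.arcsin(z / np.maximum(r, 1e-9))   # elevation determines the row
    col = ((yaw / np.pi + 1) / 2 * cols).astype(int) % cols
    lo, hi = np.radians(fov[0]), np.radians(fov[1])
    row = ((hi - pitch) / (hi - lo) * rows).astype(int).clip(0, rows - 1)
    img = np.zeros((rows, cols))
    img[row, col] = r                            # last-in wins; fine for a sketch
    return img

pts = np.random.default_rng(0).uniform(-10, 10, (5000, 3))
print(to_range_image(pts).shape)                 # (64, 1024)
</code></pre>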
<li class="arxiv-result">
<div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.08796">arXiv:2408.08796</a> <span> [<a href="https://arxiv.org/pdf/2408.08796">pdf</a>, <a href="https://arxiv.org/ps/2408.08796">ps</a>, <a href="https://arxiv.org/format/2408.08796">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div>
<p class="title is-5 mathjax"> Multi-Antenna Broadband Backscatter Communications </p>
<p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Chen%2C+H">Hao Chen</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+Z">Zhizhi Huang</a>, <a href="/search/eess?searchtype=author&query=Liang%2C+Y">Ying-Chang Liang</a>, <a href="/search/eess?searchtype=author&query=Schober%2C+R">Robert Schober</a> </p>
<p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Backscatter communication offers a promising solution to connect massive Internet-of-Things (IoT) devices with low cost and high energy efficiency. Nevertheless, its inherently passive nature limits transmission reliability, thereby hindering improvements in communication range and data rate. To overcome these challenges, we introduce a bistatic broadband backscatter communication (BBBC) system, which equips the backscatter device (BD) with multiple antennas. In the proposed BBBC system, a radio frequency (RF) source directs a sinusoidal signal to the BD, facilitating single-carrier block transmission at the BD. Meanwhile, without requiring channel state information (CSI), cyclic delay diversity (CDD) is employed at the multi-antenna BD to enhance transmission reliability through additional cyclically delayed backscattered signals. We also propose a receiver design that includes preprocessing of the time-domain received signal, pilot-based parameter estimation, and frequency-domain equalization, enabling low-complexity detection of the backscattered signal. Leveraging the matched filter bound (MFB), we analyze the achievable diversity gains in terms of outage probability. Our analysis reveals that spatial diversity is achievable under general Rayleigh fading conditions, and both frequency and spatial diversity are attainable in scenarios where the forward link experiences a line-of-sight (LoS) channel. Simulation results validate the effectiveness of the proposed BBBC system. As the number of BD antennas increases, our results show that the proposed scheme not only enhances array gain but also improves diversity order, significantly reducing both outage probability and bit error rate (BER). Consequently, it outperforms conventional schemes that yield only minor gains. </span> </p>
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p>
</li>
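<p class="is-size-7">The cyclic delay diversity step described above admits a compact numerical illustration: each antenna backscatters a cyclically shifted copy of the same block, so the receiver effectively sees a single frequency-selective channel. Delays, gains, and block length below are toy values.</p>
<pre><code>
import numpy as np

rng = np.random.default_rng(2)
N, n_ant = 64, 4
block = rng.choice([1.0, -1.0], N).astype(complex)   # one BPSK block
delays = [0, 4, 8, 12]                               # per-antenna cyclic delays
gains = rng.standard_normal(n_ant) + 1j * rng.standard_normal(n_ant)

# Superposition of cyclically delayed copies (flat channel per antenna).
rx = sum(h * np.roll(block, s) for s, h in zip(delays, gains))

# Equivalent SISO view: CDD turns spatial branches into frequency selectivity.
k = np.arange(N)
H_eff = sum(h * np.exp(-2j * np.pi * k * s / N) for s, h in zip(delays, gains))
assert np.allclose(np.fft.fft(rx), H_eff * np.fft.fft(block))
print(np.round(np.abs(H_eff[:4]), 3))
</code></pre>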
<li class="arxiv-result">
<div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.03361">arXiv:2408.03361</a> <span> [<a href="https://arxiv.org/pdf/2408.03361">pdf</a>, <a href="https://arxiv.org/format/2408.03361">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div>
<p class="title is-5 mathjax"> GMAI-MMBench: A Comprehensive Multimodal Evaluation Benchmark Towards General Medical AI </p>
<p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Chen%2C+P">Pengcheng Chen</a>, <a href="/search/eess?searchtype=author&query=Ye%2C+J">Jin Ye</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+G">Guoan Wang</a>, <a href="/search/eess?searchtype=author&query=Li%2C+Y">Yanjun Li</a>, <a href="/search/eess?searchtype=author&query=Deng%2C+Z">Zhongying Deng</a>, <a href="/search/eess?searchtype=author&query=Li%2C+W">Wei Li</a>, <a href="/search/eess?searchtype=author&query=Li%2C+T">Tianbin Li</a>, <a href="/search/eess?searchtype=author&query=Duan%2C+H">Haodong Duan</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+Z">Ziyan Huang</a>, <a href="/search/eess?searchtype=author&query=Su%2C+Y">Yanzhou Su</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+B">Benyou Wang</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+S">Shaoting Zhang</a>, <a href="/search/eess?searchtype=author&query=Fu%2C+B">Bin Fu</a>, <a href="/search/eess?searchtype=author&query=Cai%2C+J">Jianfei Cai</a>, <a href="/search/eess?searchtype=author&query=Zhuang%2C+B">Bohan Zhuang</a>, <a href="/search/eess?searchtype=author&query=Seibel%2C+E+J">Eric J Seibel</a>, <a href="/search/eess?searchtype=author&query=He%2C+J">Junjun He</a>, <a href="/search/eess?searchtype=author&query=Qiao%2C+Y">Yu Qiao</a> </p>
<p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Large Vision-Language Models (LVLMs) are capable of handling diverse data types such as imaging, text, and physiological signals, and can be applied in various fields. In the medical field, LVLMs have a high potential to offer substantial assistance for diagnosis and treatment. Before that, it is crucial to develop benchmarks to evaluate LVLMs' effectiveness in various medical applications. Current benchmarks are often built upon specific academic literature, mainly focusing on a single domain, and lacking varying perceptual granularities. Thus, they face specific challenges, including limited clinical relevance, incomplete evaluations, and insufficient guidance for interactive LVLMs. To address these limitations, we developed the GMAI-MMBench, the most comprehensive general medical AI benchmark with well-categorized data structure and multi-perceptual granularity to date. It is constructed from 284 datasets across 38 medical image modalities, 18 clinical-related tasks, 18 departments, and 4 perceptual granularities in a Visual Question Answering (VQA) format.
Additionally, we implemented a lexical tree structure that allows users to customize evaluation tasks, accommodating various assessment needs and substantially supporting medical AI research and applications. We evaluated 50 LVLMs, and the results show that even the advanced GPT-4o only achieves an accuracy of 53.96%, indicating significant room for improvement. Moreover, we identified five key insufficiencies in current cutting-edge LVLMs that need to be addressed to advance the development of better medical applications. We believe that GMAI-MMBench will stimulate the community to build the next generation of LVLMs toward GMAI. </span> </p>
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 6 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p>
<p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">GitHub: https://github.com/uni-medical/GMAI-MMBench Hugging face: https://huggingface.co/datasets/OpenGVLab/GMAI-MMBench</span> </p>
</li>
<li class="arxiv-result">
<div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.01292">arXiv:2408.01292</a> <span> [<a href="https://arxiv.org/pdf/2408.01292">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div>
<p class="title is-5 mathjax"> 3DPX: Progressive 2D-to-3D Oral Image Reconstruction with Hybrid MLP-CNN Networks </p>
<p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Li%2C+X">Xiaoshuang Li</a>, <a href="/search/eess?searchtype=author&query=Meng%2C+M">Mingyuan Meng</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+Z">Zimo Huang</a>, <a href="/search/eess?searchtype=author&query=Bi%2C+L">Lei Bi</a>, <a href="/search/eess?searchtype=author&query=Delamare%2C+E">Eduardo Delamare</a>, <a href="/search/eess?searchtype=author&query=Feng%2C+D">Dagan Feng</a>, <a href="/search/eess?searchtype=author&query=Sheng%2C+B">Bin Sheng</a>, <a href="/search/eess?searchtype=author&query=Kim%2C+J">Jinman Kim</a> </p>
<p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Panoramic X-ray (PX) is a prevalent modality in dental practice for its wide availability and low cost. However, as a 2D projection image, PX does not contain 3D anatomical information, and therefore has limited use in dental applications that can benefit from 3D information, e.g., tooth angular misalignment detection and classification. Reconstructing 3D structures directly from 2D PX has recently been explored to address limitations with existing methods primarily reliant on Convolutional Neural Networks (CNNs) for direct 2D-to-3D mapping. These methods, however, are unable to correctly infer depth-axis spatial information. In addition, they are limited by the intrinsic locality of convolution operations, as the convolution kernels only capture the information of immediate neighborhood pixels. In this study, we propose a progressive hybrid Multilayer Perceptron (MLP)-CNN pyramid network (3DPX) for 2D-to-3D oral PX reconstruction. We introduce a progressive reconstruction strategy, where 3D images are progressively reconstructed in the 3DPX with guidance imposed on the intermediate reconstruction result at each pyramid level. Further, motivated by the recent advancement of MLPs that show promise in capturing fine-grained long-range dependency, our 3DPX integrates MLPs and CNNs to improve the semantic understanding during reconstruction. Extensive experiments on two large datasets involving 464 studies demonstrate that our 3DPX outperforms state-of-the-art 2D-to-3D oral reconstruction methods, including standalone MLP and transformers, in reconstruction quality, and also improves the performance of downstream angular misalignment classification tasks. </span> </p>
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p>
<p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">accepted by MICCAI 2024</span> </p>
</li>
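<p class="is-size-7">A hypothetical hybrid block of the kind the 3DPX abstract describes, pairing a token-mixing MLP (long-range dependency) with a 3x3 convolution (local context); the module name and sizes are illustrative only, and the pyramid structure with intermediate guidance is not reproduced.</p>
<pre><code>
import torch
import torch.nn as nn

class HybridBlock(nn.Module):
    def __init__(self, ch=32, hw=16 * 16):
        super().__init__()
        self.conv = nn.Conv2d(ch, ch, 3, padding=1)       # local context
        self.mlp = nn.Sequential(nn.Linear(hw, hw), nn.GELU(), nn.Linear(hw, hw))

    def forward(self, x):                                 # x: (B, C, H, W)
        b, c, h, w = x.shape
        tokens = self.mlp(x.flatten(2))                   # mix across H*W positions
        return self.conv(x) + tokens.view(b, c, h, w)     # merge local and global

print(HybridBlock()(torch.randn(2, 32, 16, 16)).shape)    # torch.Size([2, 32, 16, 16])
</code></pre>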
<li class="arxiv-result">
<div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.21646">arXiv:2407.21646</a> <span> [<a href="https://arxiv.org/pdf/2407.21646">pdf</a>, <a href="https://arxiv.org/format/2407.21646">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div>
<p class="title is-5 mathjax"> Towards Achieving Human Parity on End-to-end Simultaneous Speech Translation via LLM Agent </p>
<p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Cheng%2C+S">Shanbo Cheng</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+Z">Zhichao Huang</a>, <a href="/search/eess?searchtype=author&query=Ko%2C+T">Tom Ko</a>, <a href="/search/eess?searchtype=author&query=Li%2C+H">Hang Li</a>, <a href="/search/eess?searchtype=author&query=Peng%2C+N">Ningxin Peng</a>, <a href="/search/eess?searchtype=author&query=Xu%2C+L">Lu Xu</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+Q">Qini Zhang</a> </p>
<p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> In this paper, we present Cross Language Agent -- Simultaneous Interpretation, CLASI, a high-quality and human-like Simultaneous Speech Translation (SiST) System. Inspired by professional human interpreters, we utilize a novel data-driven read-write strategy to balance the translation quality and latency. To address the challenge of translating in-domain terminologies, CLASI employs a multi-modal retrieving module to obtain relevant information to augment the translation. Supported by LLMs, our approach can generate error-tolerated translation by considering the input audio, historical context, and retrieved information. Experimental results show that our system outperforms other systems by significant margins.
Aligned with professional human interpreters, we evaluate CLASI with a better human evaluation metric, valid information proportion (VIP), which measures the amount of information that can be successfully conveyed to the listeners. In real-world scenarios, where the speeches are often disfluent, informal, and unclear, CLASI achieves VIP of 81.3% and 78.0% for Chinese-to-English and English-to-Chinese translation directions, respectively. In contrast, state-of-the-art commercial or open-source systems only achieve 35.4% and 41.6%. On the extremely hard dataset, where other systems achieve under 13% VIP, CLASI can still achieve 70% VIP. </span> </p>
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 31 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p>
<p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Authors are listed in alphabetical order by last name. Demonstrations and human-annotated test sets are available at https://byteresearchcla.github.io/clasi</span> </p>
</li>
<li class="arxiv-result">
<div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.21453">arXiv:2407.21453</a> <span> [<a href="https://arxiv.org/pdf/2407.21453">pdf</a>, <a href="https://arxiv.org/format/2407.21453">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div>
<p class="title is-5 mathjax"> TinyChirp: Bird Song Recognition Using TinyML Models on Low-power Wireless Acoustic Sensors </p>
<p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Huang%2C+Z">Zhaolan Huang</a>, <a href="/search/eess?searchtype=author&query=Tousnakhoff%2C+A">Adrien Tousnakhoff</a>, <a href="/search/eess?searchtype=author&query=Kozyr%2C+P">Polina Kozyr</a>, <a href="/search/eess?searchtype=author&query=Rehausen%2C+R">Roman Rehausen</a>, <a href="/search/eess?searchtype=author&query=Bie%C3%9Fmann%2C+F">Felix Bießmann</a>, <a href="/search/eess?searchtype=author&query=Lachlan%2C+R">Robert Lachlan</a>, <a href="/search/eess?searchtype=author&query=Adjih%2C+C">Cedric Adjih</a>, <a href="/search/eess?searchtype=author&query=Baccelli%2C+E">Emmanuel Baccelli</a> </p>
<p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax">
To address the challenge of translating in-domain terminologies, CLASI employs a multi-modal… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.21646v2-abstract-full').style.display = 'inline'; document.getElementById('2407.21646v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.21646v2-abstract-full" style="display: none;"> In this paper, we present Cross Language Agent -- Simultaneous Interpretation, CLASI, a high-quality and human-like Simultaneous Speech Translation (SiST) System. Inspired by professional human interpreters, we utilize a novel data-driven read-write strategy to balance the translation quality and latency. To address the challenge of translating in-domain terminologies, CLASI employs a multi-modal retrieving module to obtain relevant information to augment the translation. Supported by LLMs, our approach can generate error-tolerant translations by considering the input audio, historical context, and retrieved information. Experimental results show that our system outperforms other systems by significant margins. Aligned with professional human interpreters, we evaluate CLASI with a better human evaluation metric, valid information proportion (VIP), which measures the amount of information that can be successfully conveyed to the listeners. In real-world scenarios, where speech is often disfluent, informal, and unclear, CLASI achieves a VIP of 81.3% and 78.0% for Chinese-to-English and English-to-Chinese translation directions, respectively. In contrast, state-of-the-art commercial or open-source systems only achieve 35.4% and 41.6%. On the extremely hard dataset, where other systems achieve under 13% VIP, CLASI can still achieve 70% VIP. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.21646v2-abstract-full').style.display = 'none'; document.getElementById('2407.21646v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 31 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Authors are listed in alphabetical order by last name.
Demonstrations and human-annotated test sets are available at https://byteresearchcla.github.io/clasi</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.21453">arXiv:2407.21453</a> <span> [<a href="https://arxiv.org/pdf/2407.21453">pdf</a>, <a href="https://arxiv.org/format/2407.21453">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> TinyChirp: Bird Song Recognition Using TinyML Models on Low-power Wireless Acoustic Sensors </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Huang%2C+Z">Zhaolan Huang</a>, <a href="/search/eess?searchtype=author&query=Tousnakhoff%2C+A">Adrien Tousnakhoff</a>, <a href="/search/eess?searchtype=author&query=Kozyr%2C+P">Polina Kozyr</a>, <a href="/search/eess?searchtype=author&query=Rehausen%2C+R">Roman Rehausen</a>, <a href="/search/eess?searchtype=author&query=Bie%C3%9Fmann%2C+F">Felix Bießmann</a>, <a href="/search/eess?searchtype=author&query=Lachlan%2C+R">Robert Lachlan</a>, <a href="/search/eess?searchtype=author&query=Adjih%2C+C">Cedric Adjih</a>, <a href="/search/eess?searchtype=author&query=Baccelli%2C+E">Emmanuel Baccelli</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.21453v2-abstract-short" style="display: inline;"> Monitoring biodiversity at scale is challenging. Detecting and identifying species in fine grained taxonomies requires highly accurate machine learning (ML) methods. Training such models requires large high quality data sets. And deploying these models to low power devices requires novel compression techniques and model architectures. While species classification methods have profited from novel d… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.21453v2-abstract-full').style.display = 'inline'; document.getElementById('2407.21453v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.21453v2-abstract-full" style="display: none;"> Monitoring biodiversity at scale is challenging. Detecting and identifying species in fine grained taxonomies requires highly accurate machine learning (ML) methods. Training such models requires large high quality data sets. And deploying these models to low power devices requires novel compression techniques and model architectures. While species classification methods have profited from novel data sets and advances in ML methods, in particular neural networks, deploying these state of the art models to low power devices remains difficult. Here we present a comprehensive empirical comparison of various tinyML neural network architectures and compression techniques for species classification.
We focus on the example of bird song detection, more concretely a data set curated for studying the corn bunting bird species. The data set is released along with all code and experiments of this study. In our experiments we compare predictive performance, memory and time complexity of classical spectrogram based methods and recent approaches operating on raw audio signal. Our results indicate that individual bird species can be robustly detected with relatively simple architectures that can be readily deployed to low power devices. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.21453v2-abstract-full').style.display = 'none'; document.getElementById('2407.21453v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 31 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.19436">arXiv:2407.19436</a> <span> [<a href="https://arxiv.org/pdf/2407.19436">pdf</a>, <a href="https://arxiv.org/format/2407.19436">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> X-Fake: Juggling Utility Evaluation and Explanation of Simulated SAR Images </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Huang%2C+Z">Zhongling Huang</a>, <a href="/search/eess?searchtype=author&query=Zhuang%2C+Y">Yihan Zhuang</a>, <a href="/search/eess?searchtype=author&query=Zhong%2C+Z">Zipei Zhong</a>, <a href="/search/eess?searchtype=author&query=Xu%2C+F">Feng Xu</a>, <a href="/search/eess?searchtype=author&query=Cheng%2C+G">Gong Cheng</a>, <a href="/search/eess?searchtype=author&query=Han%2C+J">Junwei Han</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.19436v1-abstract-short" style="display: inline;"> SAR image simulation has attracted much attention due to its great potential to supplement the scarce training data for deep learning algorithms. Consequently, evaluating the quality of the simulated SAR image is crucial for practical applications. The current literature primarily uses image quality assessment techniques for evaluation that rely on human observers' perceptions. However, because of… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.19436v1-abstract-full').style.display = 'inline'; document.getElementById('2407.19436v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.19436v1-abstract-full" style="display: none;"> SAR image simulation has attracted much attention due to its great potential to supplement the scarce training data for deep learning algorithms. Consequently, evaluating the quality of the simulated SAR image is crucial for practical applications. 
The current literature primarily uses image quality assessment techniques for evaluation that rely on human observers' perceptions. However, because of the unique imaging mechanism of SAR, these techniques may produce evaluation results that are not entirely valid. The distribution inconsistency between real and simulated data is the main obstacle that influences the utility of simulated SAR images. To this end, we propose a novel trustworthy utility evaluation framework with a counterfactual explanation for simulated SAR images for the first time, denoted as X-Fake. It unifies a probabilistic evaluator and a causal explainer to achieve a trustworthy utility assessment. We construct the evaluator using a probabilistic Bayesian deep model to learn the posterior distribution, conditioned on real data. Quantitatively, the predicted uncertainty of simulated data can reflect the distribution discrepancy. We build the causal explainer with an introspective variational auto-encoder to generate high-resolution counterfactuals. The latent code of IntroVAE is finally optimized with evaluation indicators and prior information to generate the counterfactual explanation, thus revealing the inauthentic details of simulated data explicitly. The proposed framework is validated on four simulated SAR image datasets obtained from electromagnetic models and generative artificial intelligence approaches. The results demonstrate the proposed X-Fake framework outperforms other IQA methods in terms of utility. Furthermore, the results illustrate that the generated counterfactual explanations are trustworthy, and can further improve the data utility in applications. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.19436v1-abstract-full').style.display = 'none'; document.getElementById('2407.19436v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. 
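<p class="is-size-7 has-text-grey-dark">A minimal sketch of the uncertainty-scoring idea behind the evaluator described above, using Monte-Carlo dropout as the Bayesian approximation; the model, shapes, and function names are illustrative assumptions, not the authors' implementation:</p>
<pre><code class="language-python">
import torch

def mc_dropout_uncertainty(model, images, n_samples=20):
    """Score images by predictive variance under Monte-Carlo dropout.

    A classifier trained on real SAR data should be more uncertain on
    simulated images whose distribution drifts from the real one, so a
    higher score suggests lower utility (a hypothetical stand-in for the
    paper's probabilistic Bayesian evaluator).
    """
    model.train()  # keep dropout layers active at inference time
    with torch.no_grad():
        preds = torch.stack([model(images).softmax(dim=-1)
                             for _ in range(n_samples)])
    return preds.var(dim=0).mean(dim=-1)  # one score per image
</code></pre>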
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.11031">arXiv:2407.11031</a> <span> [<a href="https://arxiv.org/pdf/2407.11031">pdf</a>, <a href="https://arxiv.org/format/2407.11031">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Purification Of Contaminated Convolutional Neural Networks Via Robust Recovery: An Approach with Theoretical Guarantee in One-Hidden-Layer Case </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Lu%2C+H">Hanxiao Lu</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+Z">Zeyu Huang</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+R">Ren Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.11031v1-abstract-short" style="display: inline;"> Convolutional neural networks (CNNs), one of the key architectures of deep learning models, have achieved superior performance on many machine learning tasks such as image classification, video recognition, and power systems. Despite their success, CNNs can be easily contaminated by natural noises and artificially injected noises such as backdoor attacks. In this paper, we propose a robust recover… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.11031v1-abstract-full').style.display = 'inline'; document.getElementById('2407.11031v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.11031v1-abstract-full" style="display: none;"> Convolutional neural networks (CNNs), one of the key architectures of deep learning models, have achieved superior performance on many machine learning tasks such as image classification, video recognition, and power systems. Despite their success, CNNs can be easily contaminated by natural noises and artificially injected noises such as backdoor attacks. In this paper, we propose a robust recovery method to remove the noise from the potentially contaminated CNNs and provide an exact recovery guarantee on one-hidden-layer non-overlapping CNNs with the rectified linear unit (ReLU) activation function. Our theoretical results show that both CNNs' weights and biases can be exactly recovered under the overparameterization setting with some mild assumptions. The experimental results demonstrate the correctness of the proofs and the effectiveness of the method in both the synthetic environment and the practical neural network setting. Our results also indicate that the proposed method can be extended to multiple-layer CNNs and potentially serve as a defense strategy against backdoor attacks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.11031v1-abstract-full').style.display = 'none'; document.getElementById('2407.11031v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. 
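<p class="is-size-7 has-text-grey-dark">For concreteness, the setting the recovery guarantee above is stated for, a one-hidden-layer non-overlapping CNN with ReLU activation, can be written down in a few lines; all sizes below are illustrative assumptions:</p>
<pre><code class="language-python">
import torch
import torch.nn as nn

class OneHiddenLayerCNN(nn.Module):
    """One-hidden-layer CNN with ReLU. Setting stride equal to the kernel
    size makes the convolution patches non-overlapping, matching the
    setting of the stated guarantee (sizes are illustrative)."""

    def __init__(self, in_len=64, kernel=8, hidden=16):
        super().__init__()
        self.conv = nn.Conv1d(1, hidden, kernel_size=kernel, stride=kernel)
        self.head = nn.Linear(hidden * (in_len // kernel), 1)

    def forward(self, x):  # x: (batch, 1, in_len)
        h = torch.relu(self.conv(x))
        return self.head(h.flatten(1))
</code></pre>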
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.07554">arXiv:2407.07554</a> <span> [<a href="https://arxiv.org/pdf/2407.07554">pdf</a>, <a href="https://arxiv.org/format/2407.07554">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Beat-It: Beat-Synchronized Multi-Condition 3D Dance Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Huang%2C+Z">Zikai Huang</a>, <a href="/search/eess?searchtype=author&query=Xu%2C+X">Xuemiao Xu</a>, <a href="/search/eess?searchtype=author&query=Xu%2C+C">Cheng Xu</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+H">Huaidong Zhang</a>, <a href="/search/eess?searchtype=author&query=Zheng%2C+C">Chenxi Zheng</a>, <a href="/search/eess?searchtype=author&query=Qin%2C+J">Jing Qin</a>, <a href="/search/eess?searchtype=author&query=He%2C+S">Shengfeng He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.07554v1-abstract-short" style="display: inline;"> Dance, as an art form, fundamentally hinges on the precise synchronization with musical beats. However, achieving aesthetically pleasing dance sequences from music is challenging, with existing methods often falling short in controllability and beat alignment. To address these shortcomings, this paper introduces Beat-It, a novel framework for beat-specific, key pose-guided dance generation. Unlike… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.07554v1-abstract-full').style.display = 'inline'; document.getElementById('2407.07554v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.07554v1-abstract-full" style="display: none;"> Dance, as an art form, fundamentally hinges on the precise synchronization with musical beats. However, achieving aesthetically pleasing dance sequences from music is challenging, with existing methods often falling short in controllability and beat alignment. To address these shortcomings, this paper introduces Beat-It, a novel framework for beat-specific, key pose-guided dance generation. Unlike prior approaches, Beat-It uniquely integrates explicit beat awareness and key pose guidance, effectively resolving two main issues: the misalignment of generated dance motions with musical beats, and the inability to map key poses to specific beats, critical for practical choreography. Our approach disentangles beat conditions from music using a nearest beat distance representation and employs a hierarchical multi-condition fusion mechanism. This mechanism seamlessly integrates key poses, beats, and music features, mitigating condition conflicts and offering rich, multi-conditioned guidance for dance generation. Additionally, a specially designed beat alignment loss ensures the generated dance movements remain in sync with the designated beats. 
Extensive experiments confirm Beat-It's superiority over existing state-of-the-art methods in terms of beat alignment and motion controllability. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.07554v1-abstract-full').style.display = 'none'; document.getElementById('2407.07554v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ECCV 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.06176">arXiv:2407.06176</a> <span> [<a href="https://arxiv.org/pdf/2407.06176">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Contour-weighted loss for class-imbalanced image segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Huang%2C+Z">Zhengyong Huang</a>, <a href="/search/eess?searchtype=author&query=Sui%2C+Y">Yao Sui</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.06176v1-abstract-short" style="display: inline;"> Image segmentation is critically important in almost all medical image analysis for automatic interpretations and processing. However, it is often challenging to perform image segmentation due to data imbalance between intra- and inter-class, resulting in over- or under-segmentation. Consequently, we proposed a new methodology to address the above issue, with a compact yet effective contour-weight… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.06176v1-abstract-full').style.display = 'inline'; document.getElementById('2407.06176v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.06176v1-abstract-full" style="display: none;"> Image segmentation is critically important in almost all medical image analysis for automatic interpretations and processing. However, it is often challenging to perform image segmentation due to data imbalance between intra- and inter-class, resulting in over- or under-segmentation. Consequently, we proposed a new methodology to address the above issue, with a compact yet effective contour-weighted loss function. Our new loss function incorporates a contour-weighted cross-entropy loss and separable dice loss. The former loss extracts the contour of target regions via morphological erosion and generates a weight map for the cross-entropy criterion, whereas the latter divides the target regions into contour and non-contour components through the extracted contour map, calculates dice loss separately, and combines them to update the network.
We carried out abdominal organ segmentation and brain tumor segmentation on two public datasets to assess our approach. Experimental results demonstrated that our approach offered superior segmentation, as compared to several state-of-the-art methods, while in parallel improving the robustness of those popular state-of-the-art deep models through our new loss function. The code is available at https://github.com/huangzyong/Contour-weighted-Loss-Seg. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.06176v1-abstract-full').style.display = 'none'; document.getElementById('2407.06176v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICIP 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.02124">arXiv:2407.02124</a> <span> [<a href="https://arxiv.org/pdf/2407.02124">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.17775/CSEEJPES.2024.04000">10.17775/CSEEJPES.2024.04000 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Data-Driven Subsynchronous Oscillation Suppression for Renewable Energy Integrated Power Systems Based on Koopman Operator </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Wang%2C+Z">Zihan Wang</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+Z">Ziyang Huang</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+X">Xiaonan Zhang</a>, <a href="/search/eess?searchtype=author&query=Li%2C+G">Gengyin Li</a>, <a href="/search/eess?searchtype=author&query=Zheng%2C+L">Le Zheng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.02124v1-abstract-short" style="display: inline;"> Recently, subsynchronous oscillations (SSOs) have emerged frequently worldwide, with the high penetration of renewable power generation in modern power systems. The SSO introduced by renewables has become a prominent new stability problem, seriously threatening the stable operation of systems. This paper proposes a data-driven dynamic optimal controller for renewable energy integrated power system… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.02124v1-abstract-full').style.display = 'inline'; document.getElementById('2407.02124v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.02124v1-abstract-full" style="display: none;"> Recently, subsynchronous oscillations (SSOs) have emerged frequently worldwide, with the high penetration of renewable power generation in modern power systems. 
The SSO introduced by renewables has become a prominent new stability problem, seriously threatening the stable operation of systems. This paper proposes a data-driven dynamic optimal controller for renewable energy integrated power systems, to suppress SSOs with the control of renewables. The challenges of the controller design are the nonlinearity, complexity, and limited accessibility of the system models. Using the Koopman operator, the system dynamics are accurately extracted from data and utilized in linear model predictive control (MPC). Firstly, the globally linear representation of the system dynamics is obtained by lifting, and the key states are selected as control signals by analyzing Koopman participation factors. Subsequently, augmented with the control term, the Koopman linear parameter-varying predictor of the controlled system is constructed. Finally, using MPC, the proposed controller computes control signals online in a moving horizon fashion. Case studies show that the proposed controller is effective, adaptive and robust in various conditions, surpassing other controllers with reliable control performance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.02124v1-abstract-full').style.display = 'none'; document.getElementById('2407.02124v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> CSEE Journal of Power and Energy Systems (2024) </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.19072">arXiv:2406.19072</a> <span> [<a href="https://arxiv.org/pdf/2406.19072">pdf</a>, <a href="https://arxiv.org/format/2406.19072">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Scatterer Recognition for Multi-Modal Intelligent Vehicular Channel Modeling via Synesthesia of Machines </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Huang%2C+Z">Ziwei Huang</a>, <a href="/search/eess?searchtype=author&query=Bai%2C+L">Lu Bai</a>, <a href="/search/eess?searchtype=author&query=Han%2C+Z">Zengrui Han</a>, <a href="/search/eess?searchtype=author&query=Cheng%2C+X">Xiang Cheng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.19072v2-abstract-short" style="display: inline;"> In this paper, a novel multi-modal intelligent vehicular channel model is proposed by scatterer recognition from light detection and ranging (LiDAR) point clouds via Synesthesia of Machines (SoM). The proposed model can support the design of intelligent transportation systems (ITSs).
To provide a robust data foundation, a new intelligent sensing-communication integration dataset in vehicular urban… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.19072v2-abstract-full').style.display = 'inline'; document.getElementById('2406.19072v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.19072v2-abstract-full" style="display: none;"> In this paper, a novel multi-modal intelligent vehicular channel model is proposed by scatterer recognition from light detection and ranging (LiDAR) point clouds via Synesthesia of Machines (SoM). The proposed model can support the design of intelligent transportation systems (ITSs). To provide a robust data foundation, a new intelligent sensing-communication integration dataset in vehicular urban scenarios is constructed. Based on the constructed dataset, the complex SoM mechanism, i.e., the mapping relationship between scatterers in electromagnetic space and LiDAR point clouds in the physical environment, is explored via a multilayer perceptron (MLP) in consideration of the electromagnetic propagation mechanism. By using LiDAR point clouds to implement scatterer recognition, channel non-stationarity and consistency are captured in close coupling with the environment. Using ray-tracing (RT)-based results as the ground truth, the scatterer recognition accuracy exceeds 90%. The accuracy of the proposed model is further verified by the close fit between simulation results and RT results. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.19072v2-abstract-full').style.display = 'none'; document.getElementById('2406.19072v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 27 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024.
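<p class="is-size-7 has-text-grey-dark">The mapping step described above, from LiDAR point-cloud features to scatterers in electromagnetic space, is learned with an MLP; a minimal sketch of such a classifier follows, where the feature dimension and the binary labeling are assumptions for illustration:</p>
<pre><code class="language-python">
import torch.nn as nn

def make_scatterer_mlp(n_features=32, n_hidden=128):
    """Hypothetical MLP that labels a LiDAR point-cloud cluster as a
    scatterer (class 1) or non-scatterer (class 0); dimensions are
    assumptions, not taken from the paper."""
    return nn.Sequential(
        nn.Linear(n_features, n_hidden), nn.ReLU(),
        nn.Linear(n_hidden, n_hidden), nn.ReLU(),
        nn.Linear(n_hidden, 2),
    )
</code></pre>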
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.15222">arXiv:2406.15222</a> <span> [<a href="https://arxiv.org/pdf/2406.15222">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Rapid and Accurate Diagnosis of Acute Aortic Syndrome using Non-contrast CT: A Large-scale, Retrospective, Multi-center and AI-based Study </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Hu%2C+Y">Yujian Hu</a>, <a href="/search/eess?searchtype=author&query=Xiang%2C+Y">Yilang Xiang</a>, <a href="/search/eess?searchtype=author&query=Zhou%2C+Y">Yan-Jie Zhou</a>, <a href="/search/eess?searchtype=author&query=He%2C+Y">Yangyan He</a>, <a href="/search/eess?searchtype=author&query=Yang%2C+S">Shifeng Yang</a>, <a href="/search/eess?searchtype=author&query=Du%2C+X">Xiaolong Du</a>, <a href="/search/eess?searchtype=author&query=Den%2C+C">Chunlan Den</a>, <a href="/search/eess?searchtype=author&query=Xu%2C+Y">Youyao Xu</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+G">Gaofeng Wang</a>, <a href="/search/eess?searchtype=author&query=Ding%2C+Z">Zhengyao Ding</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+J">Jingyong Huang</a>, <a href="/search/eess?searchtype=author&query=Zhao%2C+W">Wenjun Zhao</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+X">Xuejun Wu</a>, <a href="/search/eess?searchtype=author&query=Li%2C+D">Donglin Li</a>, <a href="/search/eess?searchtype=author&query=Zhu%2C+Q">Qianqian Zhu</a>, <a href="/search/eess?searchtype=author&query=Li%2C+Z">Zhenjiang Li</a>, <a href="/search/eess?searchtype=author&query=Qiu%2C+C">Chenyang Qiu</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+Z">Ziheng Wu</a>, <a href="/search/eess?searchtype=author&query=He%2C+Y">Yunjun He</a>, <a href="/search/eess?searchtype=author&query=Tian%2C+C">Chen Tian</a>, <a href="/search/eess?searchtype=author&query=Qiu%2C+Y">Yihui Qiu</a>, <a href="/search/eess?searchtype=author&query=Lin%2C+Z">Zuodong Lin</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+X">Xiaolong Zhang</a>, <a href="/search/eess?searchtype=author&query=He%2C+Y">Yuan He</a>, <a href="/search/eess?searchtype=author&query=Yuan%2C+Z">Zhenpeng Yuan</a> , et al. (15 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.15222v3-abstract-short" style="display: inline;"> Chest pain symptoms are highly prevalent in emergency departments (EDs), where acute aortic syndrome (AAS) is a catastrophic cardiovascular emergency with a high fatality rate, especially when timely and accurate treatment is not administered. 
However, current triage practices in the ED can cause up to approximately half of patients with AAS to have an initially missed diagnosis or be misdiagnosed… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.15222v3-abstract-full').style.display = 'inline'; document.getElementById('2406.15222v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.15222v3-abstract-full" style="display: none;"> Chest pain symptoms are highly prevalent in emergency departments (EDs), where acute aortic syndrome (AAS) is a catastrophic cardiovascular emergency with a high fatality rate, especially when timely and accurate treatment is not administered. However, current triage practices in the ED can cause up to approximately half of patients with AAS to have an initially missed diagnosis or be misdiagnosed as having other acute chest pain conditions. Subsequently, these AAS patients will undergo clinically inaccurate or suboptimal differential diagnosis. Fortunately, even under these suboptimal protocols, nearly all these patients underwent non-contrast CT covering the aorta anatomy at the early stage of differential diagnosis. In this study, we developed an artificial intelligence model (DeepAAS) using non-contrast CT, which is highly accurate for identifying AAS and provides interpretable results to assist in clinical decision-making. Performance was assessed in two major phases: a multi-center retrospective study (n = 20,750) and an exploration in real-world emergency scenarios (n = 137,525). In the multi-center cohort, DeepAAS achieved a mean area under the receiver operating characteristic curve of 0.958 (95% CI 0.950-0.967). In the real-world cohort, DeepAAS detected 109 AAS patients with misguided initial suspicion, achieving 92.6% (95% CI 76.2%-97.5%) in mean sensitivity and 99.2% (95% CI 99.1%-99.3%) in mean specificity. Our AI model performed well on non-contrast CT at all applicable early stages of differential diagnosis workflows, effectively reduced the overall missed diagnosis and misdiagnosis rate from 48.8% to 4.8% and shortened the diagnosis time for patients with misguided initial suspicion from an average of 681.8 (74-11,820) mins to 68.5 (23-195) mins. DeepAAS could effectively fill the gap in the current clinical workflow without requiring additional tests. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.15222v3-abstract-full').style.display = 'none'; document.getElementById('2406.15222v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 13 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. 
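<p class="is-size-7 has-text-grey-dark">The headline screening numbers above reduce to confusion-matrix arithmetic; a quick sketch with made-up counts (not the study's data) shows how sensitivity and specificity are computed:</p>
<pre><code class="language-python">
def sensitivity_specificity(tp, fn, tn, fp):
    """Sensitivity = TP / (TP + FN); specificity = TN / (TN + FP)."""
    return tp / (tp + fn), tn / (tn + fp)

# Illustrative counts only, not taken from the study.
sens, spec = sensitivity_specificity(tp=101, fn=8, tn=135000, fp=1100)
print(f"sensitivity={sens:.3f}, specificity={spec:.3f}")
</code></pre>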
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.14931">arXiv:2406.14931</a> <span> [<a href="https://arxiv.org/pdf/2406.14931">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Multi-beam Training for Near-field Communications in High-frequency Bands </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhou%2C+C">Cong Zhou</a>, <a href="/search/eess?searchtype=author&query=You%2C+C">Changsheng You</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+Z">Zixuan Huang</a>, <a href="/search/eess?searchtype=author&query=Shi%2C+S">Shuo Shi</a>, <a href="/search/eess?searchtype=author&query=Gong%2C+Y">Yi Gong</a>, <a href="/search/eess?searchtype=author&query=Chae%2C+C">Chan-Byoung Chae</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+K">Kaibin Huang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.14931v1-abstract-short" style="display: inline;"> In this paper, we study efficient multi-beam training design for near-field communications to reduce the beam training overhead of conventional single-beam training methods. In particular, the array-division based multi-beam training method, which is widely used in far-field communications, cannot be directly applied to the near-field scenario, since different sub-arrays may observe different user… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.14931v1-abstract-full').style.display = 'inline'; document.getElementById('2406.14931v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.14931v1-abstract-full" style="display: none;"> In this paper, we study efficient multi-beam training design for near-field communications to reduce the beam training overhead of conventional single-beam training methods. In particular, the array-division based multi-beam training method, which is widely used in far-field communications, cannot be directly applied to the near-field scenario, since different sub-arrays may observe different user angles and there exist coverage holes in the angular domain. To address these issues, we first devise a new near-field multi-beam codebook by sparsely activating a portion of antennas to form a sparse linear array (SLA), hence generating multiple beams simultaneously by effectively exploiting the near-field grating lobes. Next, a two-stage near-field beam training method is proposed, for which several candidate user locations are identified firstly based on multi-beam sweeping over time, followed by the second stage to further determine the true user location with a small number of single-beam sweeps. Finally, numerical results show that our proposed multi-beam training method significantly reduces the beam training overhead of conventional single-beam training methods, yet achieving comparable rate performance in data transmission.
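<p class="is-size-7 has-text-grey-dark">The two-stage procedure above is essentially a coarse multi-beam sweep that shortlists candidate locations, followed by a few single-beam probes; a minimal sketch of that control flow, where measure_rss, the codebooks, and the covers() predicate are hypothetical stand-ins:</p>
<pre><code class="language-python">
def two_stage_beam_training(multi_beam_codebook, single_beam_codebook,
                            measure_rss, n_candidates=4):
    """Stage 1: sweep multi-beam codewords and keep the strongest
    responses. Stage 2: probe only the single beams covering those
    candidates and return the best one. All inputs are hypothetical."""
    shortlist = sorted(multi_beam_codebook, key=measure_rss,
                       reverse=True)[:n_candidates]
    probes = [beam for beam in single_beam_codebook
              if any(beam.covers(c) for c in shortlist)]
    return max(probes, key=measure_rss)
</code></pre>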
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.14931v1-abstract-full').style.display = 'none'; document.getElementById('2406.14931v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">In this paper, a novel near-field multi-beam training scheme is proposed by sparsely activating a portion of antennas to form a sparse linear array</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.14878">arXiv:2406.14878</a> <span> [<a href="https://arxiv.org/pdf/2406.14878">pdf</a>, <a href="https://arxiv.org/format/2406.14878">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> MOS: Model Synergy for Test-Time Adaptation on LiDAR-Based 3D Object Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Chen%2C+Z">Zhuoxiao Chen</a>, <a href="/search/eess?searchtype=author&query=Meng%2C+J">Junjie Meng</a>, <a href="/search/eess?searchtype=author&query=Baktashmotlagh%2C+M">Mahsa Baktashmotlagh</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+Y">Yonggang Zhang</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+Z">Zi Huang</a>, <a href="/search/eess?searchtype=author&query=Luo%2C+Y">Yadan Luo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.14878v3-abstract-short" style="display: inline;"> LiDAR-based 3D object detection is crucial for various applications but often experiences performance degradation in real-world deployments due to domain shifts. While most studies focus on cross-dataset shifts, such as changes in environments and object geometries, practical corruptions from sensor variations and weather conditions remain underexplored. In this work, we propose a novel online tes… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.14878v3-abstract-full').style.display = 'inline'; document.getElementById('2406.14878v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.14878v3-abstract-full" style="display: none;"> LiDAR-based 3D object detection is crucial for various applications but often experiences performance degradation in real-world deployments due to domain shifts. While most studies focus on cross-dataset shifts, such as changes in environments and object geometries, practical corruptions from sensor variations and weather conditions remain underexplored. 
In this work, we propose a novel online test-time adaptation framework for 3D detectors that effectively tackles these shifts, including a challenging cross-corruption scenario where cross-dataset shifts and corruptions co-occur. By leveraging long-term knowledge from previous test batches, our approach mitigates catastrophic forgetting and adapts effectively to diverse shifts. Specifically, we propose a Model Synergy (MOS) strategy that dynamically selects historical checkpoints with diverse knowledge and assembles them to best accommodate the current test batch. This assembly is directed by our proposed Synergy Weights (SW), which perform a weighted averaging of the selected checkpoints, minimizing redundancy in the composite model. The SWs are computed by evaluating the similarity of predicted bounding boxes on the test data and the independence of features between checkpoint pairs in the model bank. To maintain an efficient and informative model bank, we discard checkpoints with the lowest average SW scores, replacing them with newly updated models. Our method was rigorously tested against existing test-time adaptation strategies across three datasets and eight types of corruptions, demonstrating superior adaptability to dynamic scenes and conditions. Notably, it achieved a 67.3% improvement in a challenging cross-corruption scenario, offering a more comprehensive benchmark for adaptation. Source code: https://github.com/zhuoxiao-chen/MOS. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.14878v3-abstract-full').style.display = 'none'; document.getElementById('2406.14878v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 21 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. 
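<p class="is-size-7 has-text-grey-dark">The checkpoint assembly above amounts to a weighted average of model parameters; a minimal sketch, with the synergy weights treated as given rather than computed from box similarity and feature independence as in the paper:</p>
<pre><code class="language-python">
import copy

def assemble_checkpoints(state_dicts, weights):
    """Weighted average of checkpoint state dicts (weights sum to 1).
    A placeholder for the Synergy-Weight computation described above;
    for simplicity, buffers are averaged the same way as weights."""
    out = copy.deepcopy(state_dicts[0])
    for key in out:
        out[key] = sum(w * sd[key] for w, sd in zip(weights, state_dicts))
    return out
</code></pre>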
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to ICLR 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.08806">arXiv:2406.08806</a> <span> [<a href="https://arxiv.org/pdf/2406.08806">pdf</a>, <a href="https://arxiv.org/ps/2406.08806">ps</a>, <a href="https://arxiv.org/format/2406.08806">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Adaptive Cooperative Streaming of Holographic Video Over Wireless Networks: A Proximal Policy Optimization Solution </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Wen%2C+W">Wanli Wen</a>, <a href="/search/eess?searchtype=author&query=Yan%2C+J">Jiping Yan</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+Y">Yulu Zhang</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+Z">Zhen Huang</a>, <a href="/search/eess?searchtype=author&query=Liang%2C+L">Liang Liang</a>, <a href="/search/eess?searchtype=author&query=Jia%2C+Y">Yunjian Jia</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.08806v1-abstract-short" style="display: inline;"> Adapting holographic video streaming to fluctuating wireless channels is essential to maintain consistent and satisfactory Quality of Experience (QoE) for users, which, however, is a challenging task due to the dynamic and uncertain characteristics of wireless networks. To address this issue, we propose a holographic video cooperative streaming framework designed for a generic wireless network in… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.08806v1-abstract-full').style.display = 'inline'; document.getElementById('2406.08806v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.08806v1-abstract-full" style="display: none;"> Adapting holographic video streaming to fluctuating wireless channels is essential to maintain consistent and satisfactory Quality of Experience (QoE) for users, which, however, is a challenging task due to the dynamic and uncertain characteristics of wireless networks. To address this issue, we propose a holographic video cooperative streaming framework designed for a generic wireless network in which multiple access points can cooperatively transmit video with different bitrates to multiple users. Additionally, we model a novel QoE metric tailored specifically for holographic video streaming, which can effectively encapsulate the nuances of holographic video quality, quality fluctuations, and rebuffering occurrences simultaneously. Furthermore, we formulate a formidable QoE maximization problem, which is a non-convex mixed integer nonlinear programming problem. Using proximal policy optimization (PPO), a new class of reinforcement learning algorithms, we devise a joint beamforming and bitrate control scheme, which can be wisely adapted to fluctuations in the wireless channel. The numerical results demonstrate the superiority of the proposed scheme over representative baselines. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.08806v1-abstract-full').style.display = 'none'; document.getElementById('2406.08806v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This paper has been accepted for publication in IEEE Wireless Communications Letters</span> </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Huang%2C+Z&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Huang%2C+Z&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Huang%2C+Z&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Huang%2C+Z&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Huang%2C+Z&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Huang%2C+Z&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> </ul> </nav> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a 
href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>