Search | arXiv e-print repository
<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1&ndash;50 of 1,530 results for author: <span class="mathjax">Li, Y</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> <div class="content"> <form method="GET" action="/search/eess" aria-role="search"> Searching in archive <strong>eess</strong>. <a href="/search/?searchtype=author&amp;query=Li%2C+Y">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Li, Y"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Li%2C+Y&amp;terms-0-field=author&amp;size=50&amp;order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Li, Y"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Li%2C+Y&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Li%2C+Y&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Li%2C+Y&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Li%2C+Y&amp;start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Li%2C+Y&amp;start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Li%2C+Y&amp;start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">&hellip;</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.06940">arXiv:2412.06940</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.06940">pdf</a>, <a href="https://arxiv.org/format/2412.06940">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Digital Twin-Empowered Voltage Control for Power Systems </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Xu%2C+J">Jiachen Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+Y">Yushuai Li</a>, <a href="/search/eess?searchtype=author&amp;query=Pedersen%2C+T+B">Torben Bach Pedersen</a>, <a href="/search/eess?searchtype=author&amp;query=He%2C+Y">Yuqiang He</a>, <a href="/search/eess?searchtype=author&amp;query=Larsen%2C+K+G">Kim Guldstrand Larsen</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+T">Tianyi Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.06940v1-abstract-short" style="display: inline;"> Emerging digital twin technology has the potential to revolutionize voltage control in power systems. However, the state-of-the-art digital twin method suffers from low computational and sampling efficiency, which hinders its applications. 
To address this issue, we propose a Gumbel-Consistency Digital Twin (GC-DT) method that enhances voltage control with improved computational and sampling effici&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.06940v1-abstract-full').style.display = 'inline'; document.getElementById('2412.06940v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.06940v1-abstract-full" style="display: none;"> Emerging digital twin technology has the potential to revolutionize voltage control in power systems. However, the state-of-the-art digital twin method suffers from low computational and sampling efficiency, which hinders its applications. To address this issue, we propose a Gumbel-Consistency Digital Twin (GC-DT) method that enhances voltage control with improved computational and sampling efficiency. First, the proposed method incorporates a Gumbel-based strategy improvement that leverages the Gumbel-top trick to enhance non-repetitive sampling actions and reduce the reliance on Monte Carlo Tree Search simulations, thereby improving computational efficiency. Second, a consistency loss function aligns predicted hidden states with actual hidden states in the latent space, which increases both prediction accuracy and sampling efficiency. Experiments on IEEE 123-bus, 34-bus, and 13-bus systems demonstrate that the proposed GC-DT outperforms the state-of-the-art DT method in both computational and sampling efficiency. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.06940v1-abstract-full').style.display = 'none'; document.getElementById('2412.06940v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">6 pages, 1 figure, conference paper</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.06208">arXiv:2412.06208</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.06208">pdf</a>, <a href="https://arxiv.org/format/2412.06208">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Pilot-guided Multimodal Semantic Communication for Audio-Visual Event Localization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Yu%2C+F">Fei Yu</a>, <a href="/search/eess?searchtype=author&amp;query=Xiang%2C+Z">Zhe Xiang</a>, <a href="/search/eess?searchtype=author&amp;query=Che%2C+N">Nan Che</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+Z">Zhuoran Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+Y">Yuandi Li</a>, <a href="/search/eess?searchtype=author&amp;query=Xue%2C+J">Junxiao Xue</a>, <a href="/search/eess?searchtype=author&amp;query=Wan%2C+Z">Zhiguo Wan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.06208v1-abstract-short" style="display: inline;"> Multimodal semantic communication, which integrates various data modalities such as text, images, and audio, significantly enhances communication efficiency and reliability. It has broad application prospects in fields such as artificial intelligence, autonomous driving, and smart homes. However, current research primarily relies on analog channels and assumes constant channel states (perfect CSI)&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.06208v1-abstract-full').style.display = 'inline'; document.getElementById('2412.06208v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.06208v1-abstract-full" style="display: none;"> Multimodal semantic communication, which integrates various data modalities such as text, images, and audio, significantly enhances communication efficiency and reliability. It has broad application prospects in fields such as artificial intelligence, autonomous driving, and smart homes. However, current research primarily relies on analog channels and assumes constant channel states (perfect CSI), which is inadequate for addressing dynamic physical channels and noise in real-world scenarios. Existing methods often focus on single modality tasks and fail to handle multimodal stream data, such as video and audio, and their corresponding tasks. Furthermore, current semantic encoding and decoding modules mainly transmit single modality features, neglecting the need for multimodal semantic enhancement and recognition tasks. 
2. arXiv:2412.06208 [pdf, other] (https://arxiv.org/abs/2412.06208)
   Subjects: cs.SD (Sound); cs.CV (Computer Vision and Pattern Recognition); cs.MM (Multimedia); eess.AS (Audio and Speech Processing)
   Title: Pilot-guided Multimodal Semantic Communication for Audio-Visual Event Localization
   Authors: Fei Yu, Zhe Xiang, Nan Che, Zhuoran Zhang, Yuandi Li, Junxiao Xue, Zhiguo Wan
   Abstract: Multimodal semantic communication, which integrates various data modalities such as text, images, and audio, significantly enhances communication efficiency and reliability. It has broad application prospects in fields such as artificial intelligence, autonomous driving, and smart homes. However, current research primarily relies on analog channels and assumes constant channel states (perfect CSI), which is inadequate for addressing dynamic physical channels and noise in real-world scenarios. Existing methods often focus on single-modality tasks and fail to handle multimodal stream data, such as video and audio, and their corresponding tasks. Furthermore, current semantic encoding and decoding modules mainly transmit single-modality features, neglecting the need for multimodal semantic enhancement and recognition tasks. To address these challenges, this paper proposes a pilot-guided framework for multimodal semantic communication specifically tailored for audio-visual event localization tasks. This framework utilizes digital pilot codes and channel modules to guide the state of analog channels in real-world scenarios and designs Euler-based multimodal semantic encoding and decoding that consider time-frequency characteristics based on the dynamic channel state. This approach effectively handles multimodal stream source data, especially for audio-visual event localization tasks. Extensive numerical experiments demonstrate the robustness of the proposed framework under channel changes and its support for various communication scenarios. The experimental results show that the framework outperforms existing benchmark methods in terms of Signal-to-Noise Ratio (SNR), highlighting its advantage in semantic communication quality.
   Submitted 8 December, 2024; originally announced December 2024.

3. arXiv:2412.06001 [pdf, other] (https://arxiv.org/abs/2412.06001)
   Subjects: cs.SD (Sound); cs.MM (Multimedia); eess.AS (Audio and Speech Processing)
   Title: M6: Multi-generator, Multi-domain, Multi-lingual and cultural, Multi-genres, Multi-instrument Machine-Generated Music Detection Databases
   Authors: Yupei Li, Hanqian Li, Lucia Specia, Björn W. Schuller
   Abstract: Machine-generated music (MGM) has emerged as a powerful tool with applications in music therapy, personalised editing, and creative inspiration for the music community. However, its unregulated use threatens the entertainment, education, and arts sectors by diminishing the value of high-quality human compositions. Detecting machine-generated music (MGMD) is, therefore, critical to safeguarding these domains, yet the field lacks comprehensive datasets to support meaningful progress. To address this gap, we introduce M6, a large-scale benchmark dataset tailored for MGMD research. M6 is distinguished by its diversity, encompassing multiple generators, domains, languages, cultural contexts, genres, and instruments. We outline our methodology for data selection and collection, accompanied by detailed data analysis, providing all music in WAV format. Additionally, we provide baseline performance scores using foundational binary classification models, illustrating the complexity of MGMD and the significant room for improvement. By offering a robust and multifaceted resource, we aim to empower future research to develop more effective detection methods for MGM. We believe M6 will serve as a critical step toward addressing this societal challenge. The dataset and code will be freely available to support open collaboration and innovation in this field.
   Submitted 8 December, 2024; originally announced December 2024.
4. arXiv:2412.05957 [pdf] (https://arxiv.org/abs/2412.05957)
   Subjects: eess.SY (Systems and Control)
   Title: A Two-Stage AI-Powered Motif Mining Method for Efficient Power System Topological Analysis
   Authors: Yiyan Li, Zhenghao Zhou, Jian Ping, Xiaoyuan Xu, Zheng Yan, Jianzhong Wu
   Abstract: A graph motif, defined as a microstructure that appears repeatedly in a large graph, reveals important topological characteristics of the large graph and has gained increasing attention in power system analysis regarding reliability, vulnerability, and resiliency. However, searching for motifs within a large-scale power system is extremely computationally challenging and even infeasible, which undermines the value of motif analysis in practice. In this paper, we introduce a two-stage AI-powered motif mining method to enable efficient and wide-range motif analysis in power systems. In the first stage, a representation learning method with a specially designed network structure and loss function is proposed to achieve ordered embedding of the power system topology, simplifying the subgraph isomorphism problem into a vector comparison problem. In the second stage, under the guidance of the ordered embedding space, a greedy-search-based motif growing algorithm is introduced to quickly obtain the motifs without traversal searching. A case study based on a power system database including 61 circuit models demonstrates the effectiveness of the proposed method.
   Submitted 8 December, 2024; originally announced December 2024.
   Comments: Submitted to Applied Energy
5. arXiv:2412.05694 [pdf, other] (https://arxiv.org/abs/2412.05694)
   Subjects: cs.MM (Multimedia); cs.GR (Graphics); cs.SD (Sound); eess.AS (Audio and Speech Processing)
   Title: Combining Genre Classification and Harmonic-Percussive Features with Diffusion Models for Music-Video Generation
   Authors: Leonardo Pina, Yongmin Li
   Abstract: This study presents a novel method for generating music visualisers using diffusion models, combining audio input with user-selected artwork. The process involves two main stages: image generation and video creation. First, music captioning and genre classification are performed, followed by the retrieval of artistic style descriptions. A diffusion model then generates images based on the user's input image and the derived artistic style descriptions. The video generation stage utilises the same diffusion model to interpolate frames, controlled by audio energy vectors derived from key musical features of harmonics and percussives. The method demonstrates promising results across various genres, and a new metric, Audio-Visual Synchrony (AVS), is introduced to quantitatively evaluate the synchronisation between visual and audio elements. Comparative analysis shows significantly higher AVS values for videos generated using the proposed method with audio energy vectors, compared to linear interpolation. This approach has potential applications in diverse fields, including independent music video creation, film production, live music events, and enhancing audio-visual experiences in public spaces.
   Submitted 7 December, 2024; originally announced December 2024.
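The audio energy vectors described here can be prototyped with standard harmonic-percussive source separation. A small sketch using librosa (the input file path and the normalisation are our assumptions, not details from the paper):

```python
import librosa
import numpy as np

y, sr = librosa.load("track.wav")            # assumed input audio file
y_harm, y_perc = librosa.effects.hpss(y)     # harmonic-percussive separation

# Per-frame RMS energy of each component -> two "energy vectors".
e_harm = librosa.feature.rms(y=y_harm)[0]
e_perc = librosa.feature.rms(y=y_perc)[0]

# Normalise to [0, 1] so the vectors could weight frame-interpolation strength.
e_harm /= e_harm.max() + 1e-8
e_perc /= e_perc.max() + 1e-8
print(e_harm.shape, float(e_perc.mean()))
```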
6. arXiv:2412.05647 [pdf, ps, other] (https://arxiv.org/abs/2412.05647)
   Subjects: cs.IT (Information Theory); eess.SP (Signal Processing)
   Title: Deep Reinforcement Learning-Based Resource Allocation for Hybrid Bit and Generative Semantic Communications in Space-Air-Ground Integrated Networks
   Authors: Chong Huang, Xuyang Chen, Gaojie Chen, Pei Xiao, Geoffrey Ye Li, Wei Huang
   Abstract: In this paper, we introduce a novel framework consisting of hybrid bit-level and generative semantic communications for efficient downlink image transmission within space-air-ground integrated networks (SAGINs). The proposed model comprises multiple low Earth orbit (LEO) satellites, unmanned aerial vehicles (UAVs), and ground users. Since limitations in signal coverage and receiver antennas make direct communication between satellites and ground users infeasible in many scenarios, UAVs serve as relays and forward images from satellites to the ground users. Our hybrid communication framework effectively combines bit-level transmission with several semantic-level image generation modes, optimizing bandwidth usage to meet stringent satellite link budget constraints and ensure communication reliability and low latency under low signal-to-noise ratio (SNR) conditions. To reduce the transmission delay while ensuring the reconstruction quality at the ground user, we propose a novel metric for measuring delay and reconstruction quality in the proposed system, and employ a deep reinforcement learning (DRL)-based strategy to optimize the resources in the proposed network. Simulation results demonstrate the superiority of the proposed framework in terms of communication resource conservation, reduced latency, and maintaining high image quality, significantly outperforming traditional solutions. Therefore, the proposed framework can ensure that real-time image transmission requirements in SAGINs are met, even under dynamic network conditions and user demands.
   Submitted 7 December, 2024; originally announced December 2024.
   Comments: 12 pages

7. arXiv:2412.05084 [pdf, other] (https://arxiv.org/abs/2412.05084)
   Subjects: eess.IV (Image and Video Processing); cs.CV (Computer Vision and Pattern Recognition); physics.med-ph (Medical Physics)
   Title: Reconstructing Quantitative Cerebral Perfusion Images Directly From Measured Sinogram Data Acquired Using C-arm Cone-Beam CT
   Authors: Haotian Zhao, Ruifeng Chen, Jing Yan, Juan Feng, Jun Xiang, Yang Chen, Dong Liang, Yinsheng Li
   Abstract: To shorten the door-to-puncture time for better treating patients with acute ischemic stroke, it is highly desired to obtain quantitative cerebral perfusion images using C-arm cone-beam computed tomography (CBCT) equipped in the interventional suite. However, limited by the slow gantry rotation speed, the temporal resolution and temporal sampling density of typical C-arm CBCT are much poorer than those of multi-detector-row CT in the diagnostic imaging suite. Current quantitative perfusion imaging includes two cascaded steps: time-resolved image reconstruction and perfusion parametric estimation. For time-resolved image reconstruction, the technical challenge imposed by poor temporal resolution and poor sampling density causes inaccurate quantification of the temporal variation of cerebral artery and tissue attenuation values. For perfusion parametric estimation, it remains a technical challenge to appropriately design the handcrafted regularization for better solving the associated deconvolution problem. These two challenges together prevent obtaining quantitatively accurate perfusion images using C-arm CBCT. The purpose of this work is to simultaneously address these two challenges by combining the two cascaded steps into a single joint optimization problem and reconstructing quantitative perfusion images directly from the measured sinogram data. In the developed direct cerebral perfusion parametric image reconstruction technique, TRAINER in short, the quantitative perfusion images are represented as a subject-specific conditional generative model trained under the constraint of the time-resolved CT forward model, the perfusion convolutional model, and the subject's own measured sinogram data. Results shown in this paper demonstrate that, using TRAINER, quantitative cerebral perfusion images can be accurately obtained using C-arm CBCT in the interventional suite.
   Submitted 6 December, 2024; originally announced December 2024.
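For background, the perfusion convolutional model referenced here is conventionally the indicator-dilution relation: the tissue contrast curve equals cerebral blood flow times the arterial input function convolved with a residue function. A toy NumPy discretisation (our notation and toy curves, not TRAINER's implementation):

```python
import numpy as np

def tissue_curve(aif: np.ndarray, residue: np.ndarray, cbf: float, dt: float) -> np.ndarray:
    """C_tissue(t) = CBF * (AIF convolved with R)(t), discretised with step dt."""
    return cbf * np.convolve(aif, residue)[: len(aif)] * dt

t = np.arange(0, 60, 1.0)                  # seconds
aif = np.exp(-((t - 15.0) ** 2) / 20.0)    # toy arterial input function (bolus)
residue = np.exp(-t / 4.0)                 # toy exponential residue function
print(tissue_curve(aif, residue, cbf=0.01, dt=1.0)[:5])
```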
8. arXiv:2412.04802 [pdf, other] (https://arxiv.org/abs/2412.04802)
   Subjects: eess.IV (Image and Video Processing); cs.CV (Computer Vision and Pattern Recognition)
   Title: Modality Decoupling is All You Need: A Simple Solution for Unsupervised Hyperspectral Image Fusion
   Authors: Songcheng Du, Yang Zou, Zixu Wang, Xingyuan Li, Ying Li, Qiang Shen
   Abstract: Hyperspectral Image Fusion (HIF) aims to fuse low-resolution hyperspectral images (LR-HSIs) and high-resolution multispectral images (HR-MSIs) to reconstruct images with high spatial and high spectral resolution. Current methods typically apply direct fusion of the two modalities without valid supervision, failing to fully perceive the deep modality-complementary information and hence resulting in a superficial understanding of inter-modality connections. To bridge this gap, we propose a simple and effective solution for unsupervised HIF, built on the assumption that modality decoupling is essential for HIF. We introduce a modality clustering loss that provides clear guidance for modality decoupling, steering toward modality-shared features while keeping clear of modality-complementary ones. We also propose an end-to-end Modality-Decoupled Spatial-Spectral Fusion (MossFuse) framework that decouples shared and complementary information across modalities and aggregates a concise representation of the LR-HSI and HR-MSI to reduce modality redundancy. Systematic experiments over multiple datasets demonstrate that our simple and effective approach consistently outperforms the existing HIF methods while requiring considerably fewer parameters and reduced inference time.
   Submitted 6 December, 2024; originally announced December 2024.
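The modality clustering loss is described only at a high level; one plausible reading (ours, not MossFuse's actual loss) pulls the two modalities' shared features together in cosine space while pushing each modality's complementary features away from its shared ones:

```python
import torch
import torch.nn.functional as F

def modality_clustering_loss(s_hsi, s_msi, c_hsi, c_msi, margin: float = 0.5):
    """Illustrative clustering-style loss: attract shared features across
    modalities; repel complementary features from the shared cluster once
    their cosine similarity exceeds (1 - margin). Not the paper's loss."""
    pull = 1.0 - F.cosine_similarity(s_hsi, s_msi, dim=-1).mean()
    push = (
        F.relu(F.cosine_similarity(s_hsi, c_hsi, dim=-1) - (1.0 - margin)).mean()
        + F.relu(F.cosine_similarity(s_msi, c_msi, dim=-1) - (1.0 - margin)).mean()
    )
    return pull + push

s1, s2, c1, c2 = (torch.randn(8, 64) for _ in range(4))
print(modality_clustering_loss(s1, s2, c1, c2).item())
```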
9. arXiv:2412.04746 [pdf, other] (https://arxiv.org/abs/2412.04746)
   Subjects: cs.SD (Sound); cs.IR (Information Retrieval); cs.MM (Multimedia); eess.AS (Audio and Speech Processing)
   Title: Diff4Steer: Steerable Diffusion Prior for Generative Music Retrieval with Semantic Guidance
   Authors: Xuchan Bao, Judith Yue Li, Zhong Yi Wan, Kun Su, Timo Denk, Joonseok Lee, Dima Kuzmin, Fei Sha
   Abstract: Modern music retrieval systems often rely on fixed representations of user preferences, limiting their ability to capture users' diverse and uncertain retrieval needs. To address this limitation, we introduce Diff4Steer, a novel generative retrieval framework that employs lightweight diffusion models to synthesize diverse seed embeddings from user queries that represent potential directions for music exploration. Unlike deterministic methods that map a user query to a single point in embedding space, Diff4Steer provides a statistical prior on the target modality (audio) for retrieval, effectively capturing the uncertainty and multi-faceted nature of user preferences. Furthermore, Diff4Steer can be steered by image or text inputs, enabling more flexible and controllable music discovery combined with nearest-neighbor search. Our framework outperforms deterministic regression methods and an LLM-based generative retrieval baseline in terms of retrieval and ranking metrics, demonstrating its effectiveness in capturing user preferences and leading to more diverse and relevant recommendations. Listening examples are available at tinyurl.com/diff4steer.
   Submitted 5 December, 2024; originally announced December 2024.
   Comments: NeurIPS 2024 Creative AI Track
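The retrieval pattern described, sampling several seed embeddings from a generative prior and then running nearest-neighbor search, can be mocked up as follows (`sample_seeds` is a stand-in for the paper's diffusion sampler; the catalog is synthetic):

```python
import numpy as np
from sklearn.neighbors import NearestNeighbors

rng = np.random.default_rng(0)
catalog = rng.normal(size=(1000, 32))        # mock audio-embedding catalog

def sample_seeds(query_emb: np.ndarray, n: int = 4) -> np.ndarray:
    """Stand-in for a diffusion prior: n noisy 'directions' around the query."""
    return query_emb + 0.3 * rng.normal(size=(n, query_emb.shape[-1]))

index = NearestNeighbors(n_neighbors=5).fit(catalog)
seeds = sample_seeds(rng.normal(size=32))
_, hits = index.kneighbors(seeds)            # 5 tracks per seed -> diverse results
print(sorted(set(hits.ravel()))[:10])
```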
10. arXiv:2412.04008 [pdf, other] (https://arxiv.org/abs/2412.04008)
    Subjects: eess.SP (Signal Processing); cs.AI (Artificial Intelligence); cs.AR (Hardware Architecture); cs.NE (Neural and Evolutionary Computing)
    Title: Deep-Unrolling Multidimensional Harmonic Retrieval Algorithms on Neuromorphic Hardware
    Authors: Vlad C. Andrei, Alexandru P. Drăguţoiu, Gabriel Béna, Mahmoud Akl, Yin Li, Matthias Lohrmann, Ullrich J. Mönich, Holger Boche
    Abstract: This paper explores the potential of conversion-based neuromorphic algorithms for highly accurate and energy-efficient single-snapshot multidimensional harmonic retrieval (MHR). By casting the MHR problem as a sparse recovery problem, we devise a deep-unrolling-based Structured Learned Iterative Shrinkage and Thresholding (S-LISTA) algorithm to solve it efficiently using complex-valued convolutional neural networks with complex-valued activations, which are trained using a supervised regression objective. Afterward, a novel method for converting the complex-valued convolutional layers and activations into spiking neural networks (SNNs) is developed. At the heart of this method lies the recently proposed Few Spikes (FS) conversion, which is extended by modifying the neuron model's parameters and internal dynamics to account for the inherent coupling between real and imaginary parts in complex-valued computations. Finally, the converted SNNs are mapped onto the SpiNNaker2 neuromorphic board, and a comparison in terms of estimation accuracy and power efficiency between the original CNNs deployed on an NVIDIA Jetson Xavier and the SNNs is conducted. The measurement results show that the converted SNNs achieve almost five-fold power efficiency at moderate performance loss compared to the original CNNs.
    Submitted 5 December, 2024; originally announced December 2024.
    Comments: accepted to the 58th Asilomar Conference on Signals, Systems, and Computers, Oct. 27th - Oct. 30th, 2024, Pacific Grove, CA
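S-LISTA builds on LISTA, which unrolls the iterative shrinkage-thresholding algorithm into a fixed stack of layers with learned matrices and thresholds. A real-valued skeleton of the base recursion (the paper's variant is complex-valued; this sketch only shows the unrolling pattern with random stand-in weights):

```python
import numpy as np

def soft_threshold(x: np.ndarray, theta: float) -> np.ndarray:
    """Proximal operator of the l1 norm (shrinkage)."""
    return np.sign(x) * np.maximum(np.abs(x) - theta, 0.0)

def lista_forward(y: np.ndarray, W_e: np.ndarray, layers) -> np.ndarray:
    """Unrolled ISTA: x <- soft(W_e @ y + S @ x, theta) per layer.
    In LISTA, each layer's (S, theta) is learned rather than fixed."""
    x = np.zeros(W_e.shape[0])
    for S, theta in layers:
        x = soft_threshold(W_e @ y + S @ x, theta)
    return x

rng = np.random.default_rng(1)
m, n, depth = 20, 50, 3                      # measurements, sparse dim, layers
W_e = 0.1 * rng.normal(size=(n, m))          # stand-in learned encoder
layers = [(0.05 * rng.normal(size=(n, n)), 0.1) for _ in range(depth)]
print(lista_forward(rng.normal(size=m), W_e, layers).shape)   # (50,)
```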
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.03874v1-abstract-full').style.display = 'none'; document.getElementById('2412.03874v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">7 pages, 11 figures, 4 tables. Submitted to IEEE ICRA 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.03121">arXiv:2412.03121</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.03121">pdf</a>, <a href="https://arxiv.org/format/2412.03121">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Splats in Splats: Embedding Invisible 3D Watermark within Gaussian Splatting </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Guo%2C+Y">Yijia Guo</a>, <a href="/search/eess?searchtype=author&amp;query=Huang%2C+W">Wenkai Huang</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+Y">Yang Li</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+G">Gaolei Li</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+H">Hang Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Hu%2C+L">Liwen Hu</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+J">Jianhua Li</a>, <a href="/search/eess?searchtype=author&amp;query=Huang%2C+T">Tiejun Huang</a>, <a href="/search/eess?searchtype=author&amp;query=Ma%2C+L">Lei Ma</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.03121v1-abstract-short" style="display: inline;"> 3D Gaussian splatting (3DGS) has demonstrated impressive 3D reconstruction performance with explicit scene representations. Given the widespread application of 3DGS in 3D reconstruction and generation tasks, there is an urgent need to protect the copyright of 3DGS assets. However, existing copyright protection techniques for 3DGS overlook the usability of 3D assets, posing challenges for practical&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.03121v1-abstract-full').style.display = 'inline'; document.getElementById('2412.03121v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.03121v1-abstract-full" style="display: none;"> 3D Gaussian splatting (3DGS) has demonstrated impressive 3D reconstruction performance with explicit scene representations. Given the widespread application of 3DGS in 3D reconstruction and generation tasks, there is an urgent need to protect the copyright of 3DGS assets. However, existing copyright protection techniques for 3DGS overlook the usability of 3D assets, posing challenges for practical deployment. 
arXiv:2412.01463 (https://arxiv.org/abs/2412.01463) [pdf, other] cs.CV (Computer Vision and Pattern Recognition), eess.IV (Image and Video Processing)
Learning Differential Pyramid Representation for Tone Mapping
Authors: Qirui Yang, Yinbo Li, Peng-Tao Jiang, Qihua Cheng, Biting Yu, Yihao Liu, Huanjing Yue, Jingyu Yang
Abstract: Previous tone mapping methods mainly focus on how to enhance tones in low-resolution images and recover details using the high-frequency components extracted from the input image. These methods typically rely on traditional feature pyramids to artificially extract high-frequency components, such as Laplacian and Gaussian pyramids with handcrafted kernels.
However, traditional handcrafted features struggle to effectively capture the high-frequency components in HDR images, resulting in excessive smoothing and loss of detail in the output image. To mitigate this issue, we introduce a learnable Differential Pyramid Representation Network (DPRNet). Based on the learnable differential pyramid, our DPRNet can capture detailed textures and structures, which is crucial for high-quality tone mapping recovery. In addition, to achieve global consistency and local contrast harmonization, we design a global tone perception module and a local tone tuning module that ensure the consistency of global tuning and the accuracy of local tuning, respectively. Extensive experiments demonstrate that our method significantly outperforms state-of-the-art methods, improving PSNR by 2.58 dB on the HDR+ dataset and 3.31 dB on the HDRI Haven dataset compared with the second-best method. Notably, our method exhibits the best generalization ability in non-homologous image and video tone mapping. We provide an anonymous online demo at https://xxxxxx2024.github.io/DPRNet/.
Submitted 2 December, 2024; originally announced December 2024.
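A learnable differential pyramid can be sketched as a Laplacian pyramid whose fixed Gaussian low-pass is replaced by small learned convolutions; the module below is an illustrative guess at that structure, not DPRNet itself:

```python
import torch.nn as nn
import torch.nn.functional as F

class LearnablePyramid(nn.Module):
    """Each level's high-frequency band is the difference between the input
    and a learned low-pass, which is then downsampled for the next level."""
    def __init__(self, channels=3, levels=3):
        super().__init__()
        self.lowpass = nn.ModuleList(
            nn.Conv2d(channels, channels, 5, padding=2) for _ in range(levels))

    def forward(self, x):
        bands = []
        for lp in self.lowpass:
            low = lp(x)
            bands.append(x - low)        # learned high-frequency band
            x = F.avg_pool2d(low, 2)     # coarser level
        return bands, x                  # detail bands plus low-frequency base
```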
arXiv:2412.01279 (https://arxiv.org/abs/2412.01279) [pdf, other] eess.SP (Signal Processing)
IMNet: Interference-Aware Channel Knowledge Map Construction and Localization
Authors: Le Zhao, Zesong Fei, Xinyi Wang, Jingxuan Huang, Yuan Li, Yan Zhang
Abstract: This paper presents a novel two-stage method for constructing channel knowledge maps (CKMs) specifically for aerial-to-ground (A2G) channels in the presence of non-cooperative interfering nodes (INs). We first estimate the interfering signal strength (ISS) at sampling locations based on total received signal strength measurements and the desired communication signal strength (DSS) map constructed from the environmental topology. Next, an ISS map construction network (IMNet) is proposed, where a negative value correction module is included to enable precise reconstruction. Subsequently, we further perform signal-to-interference-plus-noise ratio map construction and IN localization. Simulation results demonstrate lower construction error of the proposed IMNet compared to baselines in the presence of interference.
Submitted 2 December, 2024; originally announced December 2024.
Comments: 5 pages, 4 figures, submitted to IEEE journals for possible publication
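The first stage amounts to a power subtraction in the linear domain. A minimal sketch, assuming dBm inputs and using a simple clipping floor where the paper instead learns a negative value correction module:

```python
import numpy as np

def estimate_iss(total_rss_dbm, dss_dbm):
    # Subtract the predicted desired-signal power from the total received
    # power in linear scale to expose the interference power.
    total_mw = 10 ** (total_rss_dbm / 10)
    dss_mw = 10 ** (dss_dbm / 10)
    iss_mw = total_mw - dss_mw
    # Measurement noise can push the estimate below zero; clip to a small
    # floor (illustrative stand-in for the learned correction) before
    # converting back to dBm.
    iss_mw = np.maximum(iss_mw, 1e-12)
    return 10 * np.log10(iss_mw)
```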
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages, 4 figures, submitted to IEEE journals for possible publication</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.00571">arXiv:2412.00571</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.00571">pdf</a>, <a href="https://arxiv.org/format/2412.00571">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> From Audio Deepfake Detection to AI-Generated Music Detection -- A Pathway and Overview </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Li%2C+Y">Yupei Li</a>, <a href="/search/eess?searchtype=author&amp;query=Milling%2C+M">Manuel Milling</a>, <a href="/search/eess?searchtype=author&amp;query=Specia%2C+L">Lucia Specia</a>, <a href="/search/eess?searchtype=author&amp;query=Schuller%2C+B+W">Bj枚rn W. Schuller</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.00571v2-abstract-short" style="display: inline;"> As Artificial Intelligence (AI) technologies continue to evolve, their use in generating realistic, contextually appropriate content has expanded into various domains. Music, an art form and medium for entertainment, deeply rooted into human culture, is seeing an increased involvement of AI into its production. However, despite the effective application of AI music generation (AIGM) tools, the unr&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.00571v2-abstract-full').style.display = 'inline'; document.getElementById('2412.00571v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.00571v2-abstract-full" style="display: none;"> As Artificial Intelligence (AI) technologies continue to evolve, their use in generating realistic, contextually appropriate content has expanded into various domains. Music, an art form and medium for entertainment, deeply rooted into human culture, is seeing an increased involvement of AI into its production. However, despite the effective application of AI music generation (AIGM) tools, the unregulated use of them raises concerns about potential negative impacts on the music industry, copyright and artistic integrity, underscoring the importance of effective AIGM detection. This paper provides an overview of existing AIGM detection methods. To lay a foundation to the general workings and challenges of AIGM detection, we first review general principles of AIGM, including recent advancements in deepfake audios, as well as multimodal detection techniques. We further propose a potential pathway for leveraging foundation models from audio deepfake detection to AIGM detection. Additionally, we discuss implications of these tools and propose directions for future research to address ongoing challenges in the field. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.00571v2-abstract-full').style.display = 'none'; document.getElementById('2412.00571v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 30 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.19385">arXiv:2411.19385</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.19385">pdf</a>, <a href="https://arxiv.org/format/2411.19385">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Zero-Forget Preservation of Semantic Communication Alignment in Distributed AI Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Hu%2C+J">Jingzhi Hu</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+G+Y">Geoffrey Ye Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.19385v1-abstract-short" style="display: inline;"> Future communication networks are expected to connect massive distributed artificial intelligence (AI). Exploiting aligned priori knowledge of AI pairs, it is promising to convert high-dimensional data transmission into highly-compressed semantic communications (SC). However, to accommodate the local data distribution and user preferences, AIs generally adapt to different domains, which fundamenta&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.19385v1-abstract-full').style.display = 'inline'; document.getElementById('2411.19385v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.19385v1-abstract-full" style="display: none;"> Future communication networks are expected to connect massive distributed artificial intelligence (AI). Exploiting aligned priori knowledge of AI pairs, it is promising to convert high-dimensional data transmission into highly-compressed semantic communications (SC). However, to accommodate the local data distribution and user preferences, AIs generally adapt to different domains, which fundamentally distorts the SC alignment. In this paper, we propose a zero-forget domain adaptation (ZFDA) framework to preserve SC alignment. To prevent the DA from changing substantial neural parameters of AI, we design sparse additive modifications (SAM) to the parameters, which can be efficiently stored and switched-off to restore the SC alignment. To optimize the SAM, we decouple it into tractable continuous variables and a binary mask, and then handle the binary mask by a score-based optimization. 
arXiv:2411.18918 (https://arxiv.org/abs/2411.18918) [pdf, other] cs.SD (Sound), eess.AS (Audio and Speech Processing)
CoDiff-VC: A Codec-Assisted Diffusion Model for Zero-shot Voice Conversion
Authors: Yuke Li, Xinfa Zhu, Hanzhao Li, JiXun Yao, WenJie Tian, XiPeng Yang, YunLin Chen, Zhifei Li, Lei Xie
Abstract: Zero-shot voice conversion (VC) aims to convert the original speaker's timbre to any target speaker while keeping the linguistic content. Current mainstream zero-shot voice conversion approaches depend on pre-trained recognition models to disentangle linguistic content and speaker representation. This results in a timbre residue within the decoupled linguistic content and inadequacies in speaker representation modeling. In this study, we propose CoDiff-VC, an end-to-end framework for zero-shot voice conversion that integrates a speech codec and a diffusion model to produce high-fidelity waveforms. Our approach employs a single-codebook codec to separate linguistic content from the source speech. To enhance content disentanglement, we introduce Mix-Style layer normalization (MSLN) to perturb the original timbre. Additionally, we incorporate a multi-scale speaker timbre modeling approach to ensure timbre consistency and improve voice detail similarity. To improve speech quality and speaker similarity, we introduce dual classifier-free guidance, providing both content and timbre guidance during the generation process. Objective and subjective experiments affirm that CoDiff-VC significantly improves speaker similarity, generating natural and higher-quality speech.
Submitted 3 December, 2024; v1 submitted 28 November, 2024; originally announced November 2024.
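Dual classifier-free guidance plausibly composes two guidance terms, one for content and one for timbre. The factorization and weights below are assumptions for illustration, not the paper's exact formulation:

```python
def dual_cfg(eps_uncond, eps_content, eps_full, w_content=2.0, w_timbre=2.0):
    # eps_uncond: unconditional score estimate; eps_content: conditioned on
    # content only; eps_full: conditioned on content + timbre. One scale pushes
    # toward the content condition, a second from content-only toward
    # content + timbre.
    return (eps_uncond
            + w_content * (eps_content - eps_uncond)
            + w_timbre * (eps_full - eps_content))
```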
arXiv:2411.18853 (https://arxiv.org/abs/2411.18853) [pdf] eess.SY (Systems and Control)
Self-Adaptive Active Damping Method for Stability Enhancement of Systems With Black-Box Inverters Considering Operating Points
Authors: Yang Li, Xiangyang Wu, Zhikang Shuai, Junbin Fang, Lili He, Yi Lei, Z. John Shen
Abstract: Due to the black-box nature of inverters and the wide variation range of operating points, it is challenging to predict on-line and adaptively enhance the stability of inverter-based systems. To solve this problem, this paper provides a feasible self-adaptive active damping method to eliminate potential small-signal instability of systems with black-box inverters under multiple operating points.
First, a framework that includes grid impedance estimation, inverter admittance identification, and a self-adaptive strategy is presented. Second, a widely applicable and engineering-friendly method for inductive-resistive grid impedance estimation is studied, in which a frequency-integral-based dq-axis aligning method is presented to avoid the inaccuracy resulting from the disturbance of θ. Then, to give the system a sufficient stability margin under different operating points, a self-adaptive active damper (SAD) and its control strategy with lag compensator modification are proposed, in which the SAD's damping compensation mechanism for enhancing the system's stability is investigated and revealed. Finally, the mapping between the system's parameter variations and the SAD's parameters is established based on an artificial neural network (ANN), serving as a computationally light model surrogate suitable for on-line tuning of the SAD's parameters to compensate the system's damping according to the operating point. The effectiveness of the proposed method is verified by simulations in PSCAD/EMTDC and experiments on the RT-Lab platform.
Submitted 27 November, 2024; originally announced November 2024.
arXiv:2411.18153 (https://arxiv.org/abs/2411.18153) [pdf, ps, other] eess.SP (Signal Processing)
Learning Rate-Compatible Linear Block Codes: An Auto-Encoder Based Approach
Authors: Yukun Cheng, Wei Chen, Tianwei Hou, Geoffrey Ye Li, Bo Ai
Abstract: Artificial intelligence (AI) provides an alternative way to design channel coding with affordable complexity. However, most existing studies can only learn codes for a given size and rate, typically defined by a fixed network architecture and a set of parameters. The support of multiple code rates is essential for conserving bandwidth under varying channel conditions, while it is costly to store multiple AI models or parameter sets. In this article, we propose auto-encoder (AE) based rate-compatible linear block codes (RC-LBCs). The coding process, associated with AI or non-AI decoders and multiple puncturing patterns, is optimized in a data-driven manner. The superior performance of the proposed AI-based RC-LBC is demonstrated through numerical experiments.
Submitted 27 November, 2024; originally announced November 2024.
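Rate compatibility through puncturing is the standard mechanism here: a single learned mother code is shortened by dropping bit positions, so higher rates reuse the same model. A minimal sketch (the pattern is illustrative):

```python
import numpy as np

def puncture(mother_codeword, pattern):
    # Keep only the positions where the puncturing pattern is 1; a rate-k/n
    # mother code punctured to m surviving bits yields an effective rate k/m.
    return mother_codeword[pattern.astype(bool)]

# e.g. a length-8, rate-1/2 mother codeword punctured to 6 bits -> rate 2/3
pattern = np.array([1, 1, 1, 0, 1, 1, 0, 1])
```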
arXiv:2411.17990 (https://arxiv.org/abs/2411.17990) [pdf, other] cs.IT (Information Theory), eess.SP (Signal Processing)
Beam Switching Based Beam Design for High-Speed Train mmWave Communications
Authors: Jingjia Huang, Chenhao Qi, Octavia A. Dobre, Geoffrey Ye Li
Abstract: For high-speed train (HST) millimeter wave (mmWave) communications, the use of narrow beams with small beam coverage requires frequent beam switching, while wider beams with smaller beam gain lead to weaker mmWave signal strength. In this paper, we consider beam switching based beam design, which is formulated as an optimization problem aiming to minimize the number of switched beams within a predetermined railway range, subject to the constraint that the receiving signal-to-noise ratio (RSNR) at the HST is no lower than a predetermined threshold. To solve this problem, we propose two sequential beam design schemes, both including two alternately performed stages. In the first stage, given an updated beam coverage according to the railway range, we transform the problem into a feasibility problem and further convert it into a min-max optimization problem by relaxing the RSNR constraints into a penalty of the objective function. In the second stage, we evaluate the feasibility of the beamformer obtained from solving the min-max problem and determine the beam coverage accordingly. Simulation results show that, compared to the first scheme, the second scheme achieves a 96.20% reduction in computational complexity at the cost of only 0.0657% performance degradation.
Submitted 26 November, 2024; originally announced November 2024.
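The first-stage relaxation described above has the familiar feasibility-as-min-max shape; in illustrative notation, with $\mathbf{w}$ the beamformer, $\Theta$ the positions in the current beam coverage, and $\gamma_0$ the RSNR threshold,

$$\min_{\mathbf{w}} \; \max_{\theta \in \Theta} \; \bigl(\gamma_0 - \mathrm{RSNR}(\mathbf{w}, \theta)\bigr),$$

so a nonpositive optimal value certifies that the RSNR constraint can be met over the whole coverage.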
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.17990v1-abstract-full').style.display = 'none'; document.getElementById('2411.17990v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.17139">arXiv:2411.17139</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.17139">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Neural-Network-Enhanced Metalens Camera for High-Definition, Dynamic Imaging in the Long-Wave Infrared Spectrum </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Wei%2C+J">Jing-Yang Wei</a>, <a href="/search/eess?searchtype=author&amp;query=Huang%2C+H">Hao Huang</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+X">Xin Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Ye%2C+D">De-Mao Ye</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+Y">Yi Li</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+L">Le Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Ma%2C+Y">Yao-Guang Ma</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+Y">Yang-Hui Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.17139v1-abstract-short" style="display: inline;"> To provide a lightweight and cost-effective solution for the long-wave infrared imaging using a singlet, we develop a camera by integrating a High-Frequency-Enhancing Cycle-GAN neural network into a metalens imaging system. The High-Frequency-Enhancing Cycle-GAN improves the quality of the original metalens images by addressing inherent frequency loss introduced by the metalens. In addition to the&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.17139v1-abstract-full').style.display = 'inline'; document.getElementById('2411.17139v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.17139v1-abstract-full" style="display: none;"> To provide a lightweight and cost-effective solution for the long-wave infrared imaging using a singlet, we develop a camera by integrating a High-Frequency-Enhancing Cycle-GAN neural network into a metalens imaging system. The High-Frequency-Enhancing Cycle-GAN improves the quality of the original metalens images by addressing inherent frequency loss introduced by the metalens. In addition to the bidirectional cyclic generative adversarial network, it incorporates a high-frequency adversarial learning module. This module utilizes wavelet transform to extract high-frequency components, and then establishes a high-frequency feedback loop. 
arXiv:2411.15921 (https://arxiv.org/abs/2411.15921) [pdf, other] cs.CV (Computer Vision and Pattern Recognition), eess.IV (Image and Video Processing)
A Tunable Despeckling Neural Network Stabilized via Diffusion Equation
Authors: Yi Ran, Zhichang Guo, Jia Li, Yao Li, Martin Burger, Boying Wu
Abstract: Multiplicative Gamma noise removal is a critical research area in the application of synthetic aperture radar (SAR) imaging, where neural networks serve as a potent tool.
However, real-world data often diverge from theoretical models, exhibiting various disturbances that make neural networks less effective. Adversarial attacks work by finding perturbations that significantly disrupt the functionality of neural networks; since neural networks are inherently unstable, they are highly susceptible to such perturbations. A network designed to withstand these extreme cases can more effectively mitigate general disturbances in real SAR data. In this work, the dissipative nature of diffusion equations is employed to underpin a novel approach for countering adversarial attacks and improving resistance to real noise disturbances. We propose a tunable, regularized neural network that unrolls a denoising unit and a regularization unit into a single network for end-to-end training. In the network, the denoising unit and the regularization unit are composed of the denoising network and the simplest linear diffusion equation, respectively. The regularization unit enhances network stability, allowing post-training time-step adjustments to effectively mitigate the adverse impacts of adversarial attacks. The stability and convergence of our model are proven theoretically, and in experiments we compare our model with several state-of-the-art denoising methods on simulated images, adversarial samples, and real SAR images, yielding superior results in both quantitative and visual evaluations.
Submitted 24 November, 2024; originally announced November 2024.
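The "simplest linear diffusion equation" used as the regularization unit corresponds to explicit heat-equation steps; a minimal sketch for single-channel images of shape (N, 1, H, W), with the time step dt remaining tunable after training:

```python
import torch
import torch.nn.functional as F

# Discrete 5-point Laplacian stencil, shaped for conv2d.
LAPLACIAN = torch.tensor([[[[0., 1., 0.],
                            [1., -4., 1.],
                            [0., 1., 0.]]]])

def diffusion_step(x: torch.Tensor, dt: float = 0.1) -> torch.Tensor:
    # One explicit Euler step of the linear heat equation u_t = laplace(u).
    # Larger dt dissipates more of a perturbation at the cost of fine detail;
    # dt <= 0.25 keeps the explicit scheme stable on a unit grid.
    lap = F.conv2d(x, LAPLACIAN.to(x), padding=1)
    return x + dt * lap
```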
arXiv:2411.15269 (https://arxiv.org/abs/2411.15269) [pdf, other] eess.IV (Image and Video Processing), cs.CV (Computer Vision and Pattern Recognition), cs.LG (Machine Learning)
MambaIRv2: Attentive State Space Restoration
Authors: Hang Guo, Yong Guo, Yaohua Zha, Yulun Zhang, Wenbo Li, Tao Dai, Shu-Tao Xia, Yawei Li
Abstract: Mamba-based image restoration backbones have recently demonstrated significant potential in balancing global reception and computational efficiency. However, the inherent causal modeling limitation of Mamba, where each token depends solely on its predecessors in the scanned sequence, restricts the full utilization of pixels across the image and thus presents new challenges in image restoration. In this work, we propose MambaIRv2, which equips Mamba with a non-causal modeling ability similar to ViTs, yielding an attentive state space restoration model. Specifically, the proposed attentive state-space equation allows attending beyond the scanned sequence and facilitates image unfolding with just a single scan. Moreover, we introduce a semantic-guided neighboring mechanism to encourage interaction between distant but similar pixels. Extensive experiments show that MambaIRv2 outperforms SRFormer by up to 0.35 dB PSNR for lightweight SR with 9.3% fewer parameters and surpasses HAT on classic SR by up to 0.29 dB.
Code is available at https://github.com/csguoh/MambaIR.
Submitted 22 November, 2024; originally announced November 2024.
Comments: Technical report

arXiv:2411.14842 (https://arxiv.org/abs/2411.14842) [pdf, other] cs.SD (Sound), cs.AI (Artificial Intelligence), eess.AS (Audio and Speech Processing)
Who Can Withstand Chat-Audio Attacks? An Evaluation Benchmark for Large Language Models
Authors: Wanqi Yang, Yanda Li, Meng Fang, Yunchao Wei, Tianyi Zhou, Ling Chen
Abstract: Adversarial audio attacks pose a significant threat to the growing use of large language models (LLMs) in voice-based human-machine interactions. While existing research has primarily focused on model-specific adversarial methods, real-world applications demand a more generalizable and universal approach to audio adversarial attacks.
In this paper, we introduce the Chat-Audio Attacks (CAA) benchmark, comprising four distinct types of audio attacks, which aims to explore the vulnerabilities of LLMs to these attacks in conversational scenarios. To evaluate the robustness of LLMs, we propose three evaluation strategies: Standard Evaluation, utilizing traditional metrics to quantify model performance under attacks; GPT-4o-Based Evaluation, which simulates real-world conversational complexities; and Human Evaluation, offering insights into user perception and trust. We evaluate six state-of-the-art LLMs with voice interaction capabilities, including Gemini-1.5-Pro, GPT-4o, and others, using the three evaluation methods on the CAA benchmark. Our comprehensive analysis reveals the impact of the four types of audio attacks on the performance of these models, demonstrating that GPT-4o exhibits the highest level of resilience.
Submitted 22 November, 2024; originally announced November 2024.

arXiv:2411.14525 (https://arxiv.org/abs/2411.14525) [pdf, other] eess.IV (Image and Video Processing), cs.CV (Computer Vision and Pattern Recognition)
SegBook: A Simple Baseline and Cookbook for Volumetric Medical Image Segmentation
Authors: Jin Ye, Ying Chen, Yanjun Li, Haoyu Wang, Zhongying Deng, Ziyan Huang, Yanzhou Su, Chenglong Ma, Yuanfeng Ji, Junjun He
Abstract: Computed Tomography (CT) is one of the most popular modalities for medical imaging. To date, CT images have contributed the largest publicly available datasets for volumetric medical segmentation tasks, covering full-body anatomical structures.
Large amounts of full-body CT images provide the opportunity to pre-train powerful models, e.g., STU-Net pre-trained in a supervised fashion, to segment numerous anatomical structures. However, it remains unclear under which conditions these pre-trained models can be transferred to various downstream medical segmentation tasks, particularly for segmenting other modalities and diverse targets. To address this problem, a large-scale benchmark for comprehensive evaluation is crucial for finding these conditions. Thus, we collected 87 public datasets varying in modality, target, and sample size to evaluate the transfer ability of full-body CT pre-trained models. We then employed a representative model, STU-Net with multiple model scales, to conduct transfer learning across modalities and targets. Our experimental results show that (1) there may be a bottleneck effect concerning dataset size in fine-tuning, with more improvement on both small- and large-scale datasets than medium-size ones; (2) models pre-trained on full-body CT demonstrate effective modality transfer, adapting well to other modalities such as MRI; and (3) pre-training on full-body CT not only supports strong performance in structure detection but also shows efficacy in lesion detection, showcasing adaptability across target tasks. We hope that this large-scale open evaluation of transfer learning can direct future research in volumetric medical image segmentation.
Submitted 21 November, 2024; originally announced November 2024.
arXiv:2411.13159 (https://arxiv.org/abs/2411.13159) [pdf, other] cs.CL cs.SD eess.AS
Hard-Synth: Synthesizing Diverse Hard Samples for ASR using Zero-Shot TTS and LLM
Authors: Jiawei Yu, Yuang Li, Xiaosong Qiao, Huan Zhao, Xiaofeng Zhao, Wei Tang, Min Zhang, Hao Yang, Jinsong Su
Abstract: Text-to-speech (TTS) models have been widely adopted to enhance automatic speech recognition (ASR) systems using text-only corpora, thereby reducing the cost of labeling real speech data. Existing research primarily utilizes additional text data and predefined speech styles supported by TTS models. In this paper, we propose Hard-Synth, a novel ASR data augmentation method that leverages large language models (LLMs) and advanced zero-shot TTS. Our approach employs LLMs to generate diverse in-domain text through rewriting, without relying on additional text data. Rather than using predefined speech styles, we introduce a hard prompt selection method with zero-shot TTS to clone speech styles that the ASR model finds challenging to recognize. Experiments demonstrate that Hard-Synth significantly enhances the Conformer model, achieving relative word error rate (WER) reductions of 6.5%/4.4% on LibriSpeech dev/test-other subsets. Additionally, we show that Hard-Synth is data-efficient and capable of reducing bias in ASR.
Submitted 20 November, 2024; originally announced November 2024.
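The hard prompt selection step is the most mechanical part of the pipeline: score reference utterances by how badly the current ASR model transcribes them, then reuse the worst ones as zero-shot TTS style prompts. A minimal sketch, assuming `asr` is a callable returning a hypothesis transcript (function names are illustrative, not from the paper):

    def wer(ref: str, hyp: str) -> float:
        # Word error rate via edit distance between word sequences.
        r, h = ref.split(), hyp.split()
        d = [[0] * (len(h) + 1) for _ in range(len(r) + 1)]
        for i in range(len(r) + 1):
            d[i][0] = i
        for j in range(len(h) + 1):
            d[0][j] = j
        for i in range(1, len(r) + 1):
            for j in range(1, len(h) + 1):
                sub = d[i - 1][j - 1] + (r[i - 1] != h[j - 1])
                d[i][j] = min(sub, d[i - 1][j] + 1, d[i][j - 1] + 1)
        return d[-1][-1] / max(len(r), 1)

    def select_hard_prompts(utterances, asr, k=100):
        # utterances: list of (audio, reference_text) pairs.
        scored = sorted(utterances, key=lambda u: wer(u[1], asr(u[0])), reverse=True)
        return scored[:k]  # the hardest speech styles become TTS cloning prompts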
arXiv:2411.11863 (https://arxiv.org/abs/2411.11863) [pdf, ps, other] eess.SP cs.LG
Longitudinal Wrist PPG Analysis for Reliable Hypertension Risk Screening Using Deep Learning
Authors: Hui Lin, Jiyang Li, Ramy Hussein, Xin Sui, Xiaoyu Li, Guangpu Zhu, Aggelos K. Katsaggelos, Zijing Zeng, Yelei Li
Abstract: Hypertension is a leading risk factor for cardiovascular diseases. Traditional blood pressure monitoring methods are cumbersome and inadequate for continuous tracking, prompting the development of PPG-based cuffless blood pressure monitoring wearables. This study leverages deep learning models, including ResNet and Transformer, to analyze wrist PPG data collected with a smartwatch for efficient hypertension risk screening, eliminating the need for handcrafted PPG features. Using the Home Blood Pressure Monitoring (HBPM) longitudinal dataset of 448 subjects and five-fold cross-validation, our model was trained on over 68k spot-check instances from 358 subjects and tested on real-world continuous recordings of 90 subjects. The compact ResNet model with 0.124M parameters performed significantly better than traditional machine learning methods, demonstrating its effectiveness in distinguishing between healthy and abnormal cases in real-world scenarios.
Submitted 2 November, 2024; originally announced November 2024.
Comments: blood pressure, hypertension, cuffless, photoplethysmography, deep learning
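A hypertension screen of this kind maps a raw PPG window to a single risk logit with no handcrafted features. The sketch below is a generic compact 1D ResNet in that spirit; the layer sizes are illustrative and do not reproduce the paper's 0.124M-parameter architecture.

    import torch
    import torch.nn as nn

    class ResBlock1d(nn.Module):
        def __init__(self, ch):
            super().__init__()
            self.conv1 = nn.Conv1d(ch, ch, 7, padding=3)
            self.conv2 = nn.Conv1d(ch, ch, 7, padding=3)
            self.bn1, self.bn2 = nn.BatchNorm1d(ch), nn.BatchNorm1d(ch)

        def forward(self, x):
            h = torch.relu(self.bn1(self.conv1(x)))
            return torch.relu(x + self.bn2(self.conv2(h)))  # residual connection

    class PPGResNet(nn.Module):
        def __init__(self):
            super().__init__()
            self.stem = nn.Conv1d(1, 32, 7, stride=2, padding=3)
            self.blocks = nn.Sequential(ResBlock1d(32), ResBlock1d(32))
            self.head = nn.Linear(32, 1)  # binary hypertension-risk logit

        def forward(self, x):                 # x: (batch, 1, samples) raw PPG window
            h = self.blocks(torch.relu(self.stem(x)))
            return self.head(h.mean(dim=-1))  # global average pooling over time

    logit = PPGResNet()(torch.randn(4, 1, 1024))  # toy batch of four windows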
arXiv:2411.10798 (https://arxiv.org/abs/2411.10798) eess.IV cs.CV
Unveiling Hidden Details: A RAW Data-Enhanced Paradigm for Real-World Super-Resolution
Authors: Long Peng, Wenbo Li, Jiaming Guo, Xin Di, Haoze Sun, Yong Li, Renjing Pei, Yang Wang, Yang Cao, Zheng-Jun Zha
Abstract: Real-world image super-resolution (Real SR) aims to generate high-fidelity, detail-rich high-resolution (HR) images from low-resolution (LR) counterparts. Existing Real SR methods primarily focus on generating details from the LR RGB domain, often leading to a lack of richness or fidelity in fine details. In this paper, we pioneer the use of details hidden in RAW data to complement existing RGB-only methods, yielding superior outputs. We argue that key image processing steps in Image Signal Processing, such as denoising and demosaicing, inherently result in the loss of fine details in LR images, making LR RAW a valuable information source. To validate this, we present RealSR-RAW, a comprehensive dataset comprising over 10,000 pairs of LR and HR RGB images, along with corresponding LR RAW, captured across multiple smartphones under varying focal lengths and diverse scenes. Additionally, we propose a novel, general RAW adapter to efficiently integrate LR RAW data into existing CNNs, Transformers, and Diffusion-based Real SR models by suppressing the noise contained in LR RAW and aligning its distribution. Extensive experiments demonstrate that incorporating RAW data significantly enhances detail recovery and improves Real SR performance across ten evaluation metrics, including both fidelity and perception-oriented metrics. Our findings open a new direction for the Real SR task; the dataset and code will be made available to support future research.
Submitted 20 November, 2024; v1 submitted 16 November, 2024; originally announced November 2024.
Comments: We sincerely apologize, but due to some commercial confidentiality agreements related to the report, we have decided to withdraw the submission for now and will resubmit after making the necessary revisions.
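Although the RAW adapter's design is described in the abstract only as suppressing noise and aligning distribution, the general shape of such an adapter is easy to picture: pack the Bayer mosaic into four color planes, extract features with a few convolutions, and add them to the RGB branch's features. A sketch under those assumptions (not the paper's actual module):

    import torch
    import torch.nn as nn

    def pack_bayer(raw):
        # (B, 1, H, W) Bayer mosaic -> (B, 4, H/2, W/2) RGGB planes.
        return torch.cat([raw[:, :, 0::2, 0::2], raw[:, :, 0::2, 1::2],
                          raw[:, :, 1::2, 0::2], raw[:, :, 1::2, 1::2]], dim=1)

    class RawAdapter(nn.Module):
        # Hypothetical adapter: light convs on packed RAW, projected to the SR
        # backbone's feature width so the result can be added to the RGB features.
        def __init__(self, feat_ch=64):
            super().__init__()
            self.body = nn.Sequential(
                nn.Conv2d(4, 32, 3, padding=1), nn.ReLU(),
                nn.Conv2d(32, feat_ch, 3, padding=1),
            )

        def forward(self, raw, rgb_feats):
            raw_feats = self.body(pack_bayer(raw))
            raw_feats = nn.functional.interpolate(raw_feats, size=rgb_feats.shape[-2:])
            return rgb_feats + raw_feats  # inject RAW detail into the existing branch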
arXiv:2411.10775 (https://arxiv.org/abs/2411.10775) [pdf, other] eess.IV cs.CV cs.MM
Beyond Feature Mapping GAP: Integrating Real HDRTV Priors for Superior SDRTV-to-HDRTV Conversion
Authors: Kepeng Xu, Li Xu, Gang He, Zhiqiang Zhang, Wenxin Yu, Shihao Wang, Dajiang Zhou, Yunsong Li
Abstract: The rise of HDR-WCG display devices has highlighted the need to convert SDRTV to HDRTV, as most video sources are still in SDR. Existing methods primarily focus on designing neural networks to learn a single-style mapping from SDRTV to HDRTV. However, the limited information in SDRTV and the diversity of styles in real-world conversions render this process an ill-posed problem, thereby constraining the performance and generalization of these methods. Inspired by generative approaches, we propose a novel method for SDRTV to HDRTV conversion guided by real HDRTV priors. Despite the limited information in SDRTV, introducing real HDRTV as reference priors significantly constrains the solution space of the originally high-dimensional ill-posed problem. This shift transforms the task from solving an unreferenced prediction problem to making a referenced selection, thereby markedly enhancing the accuracy and reliability of the conversion process. Specifically, our approach comprises two stages: the first stage employs a Vector Quantized Generative Adversarial Network to capture HDRTV priors, while the second stage matches these priors to the input SDRTV content to recover realistic HDRTV outputs. We evaluate our method on public datasets, demonstrating its effectiveness with significant improvements in both objective and subjective metrics across real and synthetic datasets.
Submitted 16 November, 2024; originally announced November 2024.
Comments: 8 pages, 4 figures
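At its core, the "referenced selection" in the second stage reduces to nearest-neighbor lookup in a learned codebook: each SDRTV feature is replaced by the closest HDRTV prior code captured by the VQGAN in stage one. A minimal sketch of that matching step, assuming an already-trained codebook:

    import torch

    def match_hdr_priors(feats, codebook):
        # feats: (N, D) features from SDRTV content; codebook: (K, D) HDRTV prior codes.
        distances = torch.cdist(feats, codebook)  # (N, K) pairwise L2 distances
        idx = distances.argmin(dim=1)             # referenced selection, not prediction
        return codebook[idx], idx

    codes, idx = match_hdr_priors(torch.randn(16, 256), torch.randn(1024, 256))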
arXiv:2411.10773 (https://arxiv.org/abs/2411.10773) [pdf, other] eess.IV cs.CV
An End-to-End Real-World Camera Imaging Pipeline
Authors: Kepeng Xu, Zijia Ma, Li Xu, Gang He, Yunsong Li, Wenxin Yu, Taichu Han, Cheng Yang
Abstract: Recent advances in neural camera imaging pipelines have demonstrated notable progress. Nevertheless, the real-world imaging pipeline still faces challenges including the lack of joint optimization in system components, computational redundancies, and optical distortions such as lens shading. In light of this, we propose an end-to-end camera imaging pipeline (RealCamNet) to enhance real-world camera imaging performance. Our methodology diverges from conventional, fragmented multi-stage image signal processing towards an end-to-end architecture. This architecture facilitates joint optimization across the full pipeline and the restoration of coordinate-biased distortions. RealCamNet is designed for high-quality conversion from RAW to RGB and compact image compression. Specifically, we deeply analyze coordinate-dependent optical distortions, e.g., vignetting and dark shading, and design a novel Coordinate-Aware Distortion Restoration (CADR) module to restore coordinate-biased distortions. Furthermore, we propose a Coordinate-Independent Mapping Compression (CIMC) module to implement tone mapping and redundant information compression. Existing datasets suffer from misalignment and overly idealized conditions, making them inadequate for training real-world imaging pipelines. Therefore, we collected a real-world imaging dataset. Experiment results show that RealCamNet achieves the best rate-distortion performance with lower inference latency.
Submitted 16 November, 2024; originally announced November 2024.
Comments: Accepted by ACMMM 2024
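The abstract does not detail the CADR module, but restoring coordinate-dependent distortions such as vignetting generally requires the network to know where each pixel sits in the frame. One common way to provide that, sketched below, is to concatenate normalized coordinate maps to the feature tensor (a CoordConv-style block; the actual CADR design may differ):

    import torch
    import torch.nn as nn

    class CoordAwareBlock(nn.Module):
        def __init__(self, ch):
            super().__init__()
            self.conv = nn.Conv2d(ch + 2, ch, 3, padding=1)

        def forward(self, x):
            b, _, h, w = x.shape
            # Normalized (y, x) position maps in [-1, 1], one value per pixel.
            ys = torch.linspace(-1, 1, h, device=x.device).view(1, 1, h, 1).expand(b, 1, h, w)
            xs = torch.linspace(-1, 1, w, device=x.device).view(1, 1, 1, w).expand(b, 1, h, w)
            return torch.relu(self.conv(torch.cat([x, ys, xs], dim=1)))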
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10262v1-abstract-full').style.display = 'none'; document.getElementById('2411.10262v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.08144">arXiv:2411.08144</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.08144">pdf</a>, <a href="https://arxiv.org/format/2411.08144">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Visual Tracking with Intermittent Visibility: Switched Control Design and Implementation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Li%2C+Y">Yangge Li</a>, <a href="/search/eess?searchtype=author&amp;query=Yang%2C+B+C">Benjamin C Yang</a>, <a href="/search/eess?searchtype=author&amp;query=Mitra%2C+S">Sayan Mitra</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.08144v1-abstract-short" style="display: inline;"> This paper addresses the problem of visual target tracking in scenarios where a pursuer may experience intermittent loss of visibility of the target. The design of a Switched Visual Tracker (SVT) is presented which aims to meet the competing requirements of maintaining both proximity and visibility. SVT alternates between a visual tracking mode for following the target, and a recovery mode for reg&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.08144v1-abstract-full').style.display = 'inline'; document.getElementById('2411.08144v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.08144v1-abstract-full" style="display: none;"> This paper addresses the problem of visual target tracking in scenarios where a pursuer may experience intermittent loss of visibility of the target. The design of a Switched Visual Tracker (SVT) is presented which aims to meet the competing requirements of maintaining both proximity and visibility. SVT alternates between a visual tracking mode for following the target, and a recovery mode for regaining visual contact when the target falls out of sight. We establish the stability of SVT by extending the average dwell time theorem from switched systems theory, which may be of independent interest. Our implementation of SVT on an Agilicious drone [1] illustrates its effectiveness on tracking various target trajectories: it reduces the average tracking error by up to 45% and significantly improves visibility duration compared to a baseline algorithm. The results show that our approach effectively handles intermittent vision loss, offering enhanced robustness and adaptability for real-world autonomous missions. 
arXiv:2411.08144 (https://arxiv.org/abs/2411.08144) [pdf, other] cs.RO eess.SY
Visual Tracking with Intermittent Visibility: Switched Control Design and Implementation
Authors: Yangge Li, Benjamin C Yang, Sayan Mitra
Abstract: This paper addresses the problem of visual target tracking in scenarios where a pursuer may experience intermittent loss of visibility of the target. The design of a Switched Visual Tracker (SVT) is presented which aims to meet the competing requirements of maintaining both proximity and visibility. SVT alternates between a visual tracking mode for following the target, and a recovery mode for regaining visual contact when the target falls out of sight. We establish the stability of SVT by extending the average dwell time theorem from switched systems theory, which may be of independent interest. Our implementation of SVT on an Agilicious drone [1] illustrates its effectiveness on tracking various target trajectories: it reduces the average tracking error by up to 45% and significantly improves visibility duration compared to a baseline algorithm. The results show that our approach effectively handles intermittent vision loss, offering enhanced robustness and adaptability for real-world autonomous missions. Additionally, we demonstrate how the stability analysis provides valuable guidance for selecting parameters, such as tracking speed and recovery distance, to optimize the SVT's performance.
Submitted 12 November, 2024; originally announced November 2024.
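Stripped of the stability machinery, the controller's outer loop is a two-mode switch with a dwell-time guard so it cannot chatter between following and recovering. A minimal sketch of that logic only (parameter values and method names are illustrative, not the paper's implementation):

    TRACK, RECOVER = "track", "recover"

    class SwitchedVisualTracker:
        def __init__(self, min_dwell_s=0.5):
            # A minimum dwell time keeps switches sparse, in the spirit of the
            # average dwell time condition used in the stability analysis.
            self.mode, self.last_switch, self.min_dwell = TRACK, 0.0, min_dwell_s

        def step(self, t, target_visible, last_seen_pose):
            if t - self.last_switch >= self.min_dwell:
                desired = TRACK if target_visible else RECOVER
                if desired != self.mode:
                    self.mode, self.last_switch = desired, t
            if self.mode == TRACK:
                return ("follow_target", last_seen_pose)
            return ("goto", last_seen_pose)  # steer toward the last sighting to regain view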
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06564v1-abstract-full').style.display = 'none'; document.getElementById('2411.06564v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.05870">arXiv:2411.05870</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.05870">pdf</a>, <a href="https://arxiv.org/format/2411.05870">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Dynamical Systems">math.DS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Probability">math.PR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Data Analysis, Statistics and Probability">physics.data-an</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Methodology">stat.ME</span> </div> </div> <p class="title is-5 mathjax"> An Adaptive Online Smoother with Closed-Form Solutions and Information-Theoretic Lag Selection for Conditional Gaussian Nonlinear Systems </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Andreou%2C+M">Marios Andreou</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+N">Nan Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+Y">Yingda Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.05870v1-abstract-short" style="display: inline;"> Data assimilation (DA) combines partial observations with a dynamical model to improve state estimation. Filter-based DA uses only past and present data and is the prerequisite for real-time forecasts. Smoother-based DA exploits both past and future observations. It aims to fill in missing data, provide more accurate estimations, and develop high-quality datasets. However, the standard smoothing p&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05870v1-abstract-full').style.display = 'inline'; document.getElementById('2411.05870v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.05870v1-abstract-full" style="display: none;"> Data assimilation (DA) combines partial observations with a dynamical model to improve state estimation. Filter-based DA uses only past and present data and is the prerequisite for real-time forecasts. Smoother-based DA exploits both past and future observations. It aims to fill in missing data, provide more accurate estimations, and develop high-quality datasets. However, the standard smoothing procedure requires using all historical state estimations, which is storage-demanding, especially for high-dimensional systems. 
arXiv:2411.05870 (https://arxiv.org/abs/2411.05870) [pdf, other] eess.SY math.DS math.PR physics.data-an stat.ME
An Adaptive Online Smoother with Closed-Form Solutions and Information-Theoretic Lag Selection for Conditional Gaussian Nonlinear Systems
Authors: Marios Andreou, Nan Chen, Yingda Li
Abstract: Data assimilation (DA) combines partial observations with a dynamical model to improve state estimation. Filter-based DA uses only past and present data and is the prerequisite for real-time forecasts. Smoother-based DA exploits both past and future observations. It aims to fill in missing data, provide more accurate estimations, and develop high-quality datasets. However, the standard smoothing procedure requires using all historical state estimations, which is storage-demanding, especially for high-dimensional systems. This paper develops an adaptive-lag online smoother for a large class of complex dynamical systems with strong nonlinear and non-Gaussian features, which has important applications to many real-world problems. The adaptive lag allows the DA to utilize only observations within a nearby window, significantly reducing computational storage. Online lag adjustment is essential for tackling turbulent systems, where temporal autocorrelation varies significantly over time due to intermittency, extreme events, and nonlinearity. Based on the uncertainty reduction in the estimated state, an information criterion is developed to systematically determine the adaptive lag. Notably, the mathematical structure of these systems facilitates the use of closed analytic formulae to calculate the online smoother and the adaptive lag, avoiding empirical tunings as in ensemble-based DA methods. The adaptive online smoother is applied to studying three important scientific problems. First, it helps detect online causal relationships between state variables. Second, its advantage of computational storage is illustrated via Lagrangian DA, a high-dimensional nonlinear problem. Finally, the adaptive smoother advances online parameter estimation with partial observations, emphasizing the role of the observed extreme events in accelerating convergence.
Submitted 7 November, 2024; originally announced November 2024.
Comments: 40 pages, 7 figures, typeset in LaTeX. Submitted for peer-review to Springer Nature's Journal of Nonlinear Science. For more info see https://sites.google.com/wisc.edu/mariosandreou/pubs-and-talks/cgns-online-martingale-free#h.55a05qfs9w12
MSC Class: 60H10; 62M20; 93E14 (Primary) 62F15 (Secondary)
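The paper computes its lag through closed analytic formulae specific to conditional Gaussian nonlinear systems, which the abstract does not reproduce; the sketch below therefore only captures the generic shape of an uncertainty-based stopping rule: extend the smoothing window while each extra observation still buys a meaningful reduction in posterior uncertainty.

    def choose_adaptive_lag(posterior_var, max_lag, tol=1e-3):
        # posterior_var(L) is assumed to return the smoother's posterior variance
        # for the current state when observations up to lag L are used.
        lag, prev = 1, posterior_var(1)
        while lag < max_lag:
            cur = posterior_var(lag + 1)
            if (prev - cur) / max(prev, 1e-12) < tol:   # marginal information gain
                break
            lag, prev = lag + 1, cur
        return lag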
arXiv:2411.05492 (https://arxiv.org/abs/2411.05492) [pdf, ps, other] cs.IT eess.SP math.OC
Covariance-Based Device Activity Detection with Massive MIMO for Near-Field Correlated Channels
Authors: Ziyue Wang, Yang Li, Ya-Feng Liu, Junjie Ma
Abstract: This paper studies the device activity detection problem in a massive multiple-input multiple-output (MIMO) system for near-field communications (NFC). In this system, active devices transmit their signature sequences to the base station (BS), which detects the active devices based on the received signal. We model the near-field channels as correlated Rician fading channels and formulate the device activity detection problem as a maximum likelihood estimation (MLE) problem. Compared to the traditional uncorrelated channel model, the correlation of channels complicates both algorithm design and theoretical analysis of the MLE problem. On the algorithmic side, we propose two computationally efficient algorithms for solving the MLE problem: an exact coordinate descent (CD) algorithm and an inexact CD algorithm. The exact CD algorithm solves the one-dimensional optimization subproblem exactly using matrix eigenvalue decomposition and polynomial root-finding. By approximating the objective function appropriately, the inexact CD algorithm solves the one-dimensional optimization subproblem inexactly with lower complexity and more robust numerical performance. Additionally, we analyze the detection performance of the MLE problem under correlated channels by comparing it with the case of uncorrelated channels. The analysis shows that when the overall number of devices $N$ is large or the signature sequence length $L$ is small, the detection performance of MLE under correlated channels tends to be better than that under uncorrelated channels. Conversely, when $N$ is small or $L$ is large, MLE performs better under uncorrelated channels than under correlated ones. Simulation results demonstrate the computational efficiency of the proposed algorithms and verify the correctness of the analysis.
Submitted 8 November, 2024; originally announced November 2024.
Comments: 15 pages, 8 figures, submitted for possible publication
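For orientation, the classical coordinate descent update for covariance-based activity detection in the simpler i.i.d.-channel setting has a closed form; the paper's correlated Rician case replaces this scalar step with the eigendecomposition and root-finding subproblem described above. A sketch of the classical loop (not the paper's exact algorithm):

    import numpy as np

    def cd_activity_detection(Y, S, sigma2, iters=20):
        # Y: (L, M) received signal over M antennas; S: (L, N) signature sequences.
        L, M = Y.shape
        N = S.shape[1]
        a = np.zeros(N)                            # activity estimates, a_n >= 0
        Sigma_hat = Y @ Y.conj().T / M             # sample covariance of the columns
        Sigma = sigma2 * np.eye(L, dtype=complex)  # model: S diag(a) S^H + sigma2 I
        for _ in range(iters):
            for n in range(N):
                s = S[:, n]
                Sinv_s = np.linalg.solve(Sigma, s)
                num = (Sinv_s.conj() @ Sigma_hat @ Sinv_s).real - (s.conj() @ Sinv_s).real
                den = ((s.conj() @ Sinv_s).real) ** 2
                delta = max(num / den, -a[n])      # closed-form step, clipped at a_n >= 0
                a[n] += delta
                Sigma += delta * np.outer(s, s.conj())
        return a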
arXiv:2411.04153 (https://arxiv.org/abs/2411.04153) [pdf, other] eess.IV cs.CV
Urban Flood Mapping Using Satellite Synthetic Aperture Radar Data: A Review of Characteristics, Approaches and Datasets
Authors: Jie Zhao, Ming Li, Yu Li, Patrick Matgen, Marco Chini
Abstract: Understanding the extent of urban flooding is crucial for assessing building damage, casualties and economic losses. Synthetic Aperture Radar (SAR) technology offers significant advantages for mapping flooded urban areas due to its ability to collect data regardless of weather and solar illumination conditions. However, the wide range of existing methods makes it difficult to choose the best approach for a specific situation and to identify future research directions. Therefore, this study provides a comprehensive review of current research on urban flood mapping using SAR data, summarizing key characteristics of floodwater in SAR images and outlining various approaches from scientific articles. Additionally, we provide a brief overview of the advantages and disadvantages of each method category, along with guidance on selecting the most suitable approach for different scenarios. This study focuses on the challenges and advancements in SAR-based urban flood mapping. It specifically addresses the limitations of spatial and temporal resolution in SAR data and discusses the essential pre-processing steps. Moreover, the article explores the potential benefits of Polarimetric SAR (PolSAR) techniques and uncertainty analysis for future research. Furthermore, it highlights a lack of open-access SAR datasets for urban flood mapping, hindering development of advanced deep learning-based methods. In addition, we evaluated the Technology Readiness Levels (TRLs) of urban flood mapping techniques to identify challenges and future research areas. Finally, the study explores the practical applications of SAR-based urban flood mapping in both the private and public sectors and provides a comprehensive overview of the benefits and potential impact of these methods.
Submitted 6 November, 2024; originally announced November 2024.
Comments: Accepted by IEEE Geoscience and Remote Sensing Magazine
arXiv:2411.01991 (https://arxiv.org/abs/2411.01991) [pdf, other] eess.SP
Multimodal Trustworthy Semantic Communication for Audio-Visual Event Localization
Authors: Yuandi Li, Zhe Xiang, Fei Yu, Zhangshuang Guan, Hui Ji, Zhiguo Wan, Cheng Feng
Abstract: The exponential growth in wireless data traffic, driven by the proliferation of mobile devices and smart applications, poses significant challenges for modern communication systems. Ensuring the secure and reliable transmission of multimodal semantic information is increasingly critical, particularly for tasks like Audio-Visual Event (AVE) localization. This letter introduces MMTrustSC, a novel framework designed to address these challenges by enhancing the security and reliability of multimodal communication. MMTrustSC incorporates advanced semantic encoding techniques to safeguard data integrity and privacy. It features a two-level coding scheme that combines error-correcting codes with conventional encoders to improve the accuracy and reliability of multimodal data transmission. Additionally, MMTrustSC employs hybrid encryption, integrating both asymmetric and symmetric encryption methods, to secure semantic information and ensure its confidentiality and integrity across potentially hostile networks. Simulation results validate MMTrustSC's effectiveness, demonstrating substantial improvements in data transmission accuracy and reliability for AVE localization tasks. This framework represents a significant advancement in managing intermodal information complementarity and mitigating physical noise, thus enhancing overall system performance.
Submitted 4 November, 2024; originally announced November 2024.
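The abstract names the ingredients of the hybrid scheme (asymmetric plus symmetric encryption) without fixing the primitives, so the following is a generic sketch of that pattern using the Python `cryptography` package: an AES-GCM key protects the bulky semantic payload, and the receiver's RSA public key wraps that one-time key. The specific algorithms are assumptions, not MMTrustSC's specification.

    import os
    from cryptography.hazmat.primitives import hashes
    from cryptography.hazmat.primitives.asymmetric import rsa, padding
    from cryptography.hazmat.primitives.ciphers.aead import AESGCM

    oaep = padding.OAEP(mgf=padding.MGF1(algorithm=hashes.SHA256()),
                        algorithm=hashes.SHA256(), label=None)
    receiver_key = rsa.generate_private_key(public_exponent=65537, key_size=2048)

    # Sender: symmetric cipher for the payload, asymmetric wrap for the session key.
    payload = b"encoded multimodal semantic features"
    session_key, nonce = AESGCM.generate_key(bit_length=128), os.urandom(12)
    ciphertext = AESGCM(session_key).encrypt(nonce, payload, None)  # GCM also authenticates
    wrapped_key = receiver_key.public_key().encrypt(session_key, oaep)

    # Receiver: unwrap the session key, then decrypt and verify the payload.
    recovered = AESGCM(receiver_key.decrypt(wrapped_key, oaep)).decrypt(nonce, ciphertext, None)
    assert recovered == payload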
arXiv:2411.01918 (https://arxiv.org/abs/2411.01918) [pdf] eess.SY
Preemptive Holistic Collaborative System and Its Application in Road Transportation
Authors: Ting Peng, Yuan Li, Tao Li, Xiaoxue Xu, Xiang Dong, Yincai Cai
Abstract: Numerous real-world systems, including manufacturing processes, supply chains, and robotic systems, involve multiple independent entities with diverse objectives. The potential for conflicts arises from the inability of these entities to accurately predict and anticipate each other's actions. To address this challenge, we propose the Preemptive Holistic Collaborative System (PHCS) framework. By enabling information sharing and collaborative planning among independent entities, the PHCS facilitates the preemptive resolution of potential conflicts. We apply the PHCS framework to the specific context of road transportation, resulting in the Preemptive Holistic Collaborative Road Transportation System (PHCRTS). This system leverages shared driving intentions and pre-planned trajectories to optimize traffic flow and enhance safety. Simulation experiments in a two-lane merging scenario demonstrate the effectiveness of PHCRTS, reducing vehicle time delays by 90%, increasing traffic capacity by 300%, and eliminating accidents. The PHCS framework offers a promising approach to optimize the performance and safety of complex systems with multiple independent entities.
Submitted 4 November, 2024; originally announced November 2024.
There is increasing evidence that neural activity in the WM can be mea&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01859v1-abstract-full').style.display = 'inline'; document.getElementById('2411.01859v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.01859v1-abstract-full" style="display: none;"> Tractography fiber clustering using diffusion MRI (dMRI) is a crucial strategy for white matter (WM) parcellation. Current methods primarily use the geometric information of fibers (i.e., the spatial trajectories) to group similar fibers into clusters, overlooking the important functional signals present along the fiber tracts. There is increasing evidence that neural activity in the WM can be measured using functional MRI (fMRI), offering potentially valuable multimodal information for fiber clustering. In this paper, we develop a novel deep learning fiber clustering framework, namely Deep Multi-view Fiber Clustering (DMVFC), that uses joint dMRI and fMRI data to enable functionally consistent WM parcellation. DMVFC can effectively integrate the geometric characteristics of the WM fibers with the fMRI BOLD signals along the fiber tracts. It includes two major components: 1) a multi-view pretraining module to compute embedding features from fiber geometric information and functional signals separately, and 2) a collaborative fine-tuning module to simultaneously refine the two kinds of embeddings. In the experiments, we compare DMVFC with two state-of-the-art fiber clustering methods and demonstrate superior performance in achieving functionally meaningful and consistent WM parcellation results. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01859v1-abstract-full').style.display = 'none'; document.getElementById('2411.01859v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages, 3 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.00911">arXiv:2411.00911</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.00911">pdf</a>, <a href="https://arxiv.org/format/2411.00911">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Geophysics">physics.geo-ph</span> </div> </div> <p class="title is-5 mathjax"> Zero-Shot Self-Consistency Learning for Seismic Irregular Spatial Sampling Reconstruction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Peng%2C+J">Junheng Peng</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+Y">Yingtian Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+M">Mingwei Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+Y">Yong Li</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+H">Huating Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.00911v1-abstract-short" style="display: inline;"> Seismic exploration is currently the most important method for understanding subsurface structures. However, due to surface conditions, seismic receivers may not be uniformly distributed along the measurement line, making the entire exploration work difficult to carry out. Previous deep learning methods for reconstructing seismic data often relied on additional datasets for training. While some ex&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00911v1-abstract-full').style.display = 'inline'; document.getElementById('2411.00911v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.00911v1-abstract-full" style="display: none;"> Seismic exploration is currently the most important method for understanding subsurface structures. However, due to surface conditions, seismic receivers may not be uniformly distributed along the measurement line, making the entire exploration work difficult to carry out. Previous deep learning methods for reconstructing seismic data often relied on additional datasets for training. While some existing methods do not require extra data, they lack constraints on the reconstruction data, leading to unstable reconstruction performance. In this paper, we proposed a zero-shot self-consistency learning strategy and employed an extremely lightweight network for seismic data reconstruction. Our method does not require additional datasets and utilizes the correlations among different parts of the data to design a self-consistency learning loss function, driving a network with only 90,609 learnable parameters. 
We applied this method to the USGS National Petroleum Reserve-Alaska public dataset, and the results indicate that our approach achieves good reconstruction quality. Additionally, our method demonstrates a certain degree of noise suppression, which is highly beneficial for large and complex seismic exploration tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00911v1-abstract-full').style.display = 'none'; document.getElementById('2411.00911v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">12 pages, 8 figures</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 68T07 <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.4.5 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.00813">arXiv:2411.00813</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.00813">pdf</a>, <a href="https://arxiv.org/format/2411.00813">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Social and Information Networks">cs.SI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Personality Analysis from Online Short Video Platforms with Multi-domain Adaptation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=An%2C+S">Sixu An</a>, <a href="/search/eess?searchtype=author&amp;query=Sun%2C+X">Xiangguo Sun</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+Y">Yicong Li</a>, <a href="/search/eess?searchtype=author&amp;query=Yang%2C+Y">Yu Yang</a>, <a href="/search/eess?searchtype=author&amp;query=Xu%2C+G">Guandong Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.00813v1-abstract-short" style="display: inline;"> Personality analysis from online short videos has gained prominence due to its applications in personalized recommendation systems, sentiment analysis, and human-computer interaction.
Traditional assessment methods, such as questionnaires based on the Big Five Personality Framework, are limited by self-report biases and are impractical for large-scale or real-time analysis. Leveraging the rich, mu&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00813v1-abstract-full').style.display = 'inline'; document.getElementById('2411.00813v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.00813v1-abstract-full" style="display: none;"> Personality analysis from online short videos has gained prominence due to its applications in personalized recommendation systems, sentiment analysis, and human-computer interaction. Traditional assessment methods, such as questionnaires based on the Big Five Personality Framework, are limited by self-report biases and are impractical for large-scale or real-time analysis. Leveraging the rich, multi-modal data present in short videos offers a promising alternative for more accurate personality inference. However, integrating these diverse and asynchronous modalities poses significant challenges, particularly in aligning time-varying data and ensuring models generalize well to new domains with limited labeled data. In this paper, we propose a novel multi-modal personality analysis framework that addresses these challenges by synchronizing and integrating features from multiple modalities and enhancing model generalization through domain adaptation. We introduce a timestamp-based modality alignment mechanism that synchronizes data based on spoken word timestamps, ensuring accurate correspondence across modalities and facilitating effective feature integration. To capture temporal dependencies and inter-modal interactions, we employ Bidirectional Long Short-Term Memory networks and self-attention mechanisms, allowing the model to focus on the most informative features for personality prediction. Furthermore, we develop a gradient-based domain adaptation method that transfers knowledge from multiple source domains to improve performance in target domains with scarce labeled data. Extensive experiments on real-world datasets demonstrate that our framework significantly outperforms existing methods in personality prediction tasks, highlighting its effectiveness in capturing complex behavioral cues and robustness in adapting to new domains. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00813v1-abstract-full').style.display = 'none'; document.getElementById('2411.00813v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
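<p class="is-size-7">The timestamp-based alignment described above can be made concrete with a small sketch: pool each modality's frame features over the span of every spoken word, so that all modalities share one word-level time axis. The function name, frame rates, and mean-pooling are assumptions for illustration, not the paper's implementation.</p>
<pre><code class="language-python">
import numpy as np

def align_by_word_timestamps(frames, fps, words):
    """Pool a (T, D) frame-feature stream into one vector per spoken word.
    `words` is a list of (start_sec, end_sec) pairs from an ASR transcript (assumed input)."""
    pooled = []
    for start, end in words:
        lo = min(int(start * fps), len(frames) - 1)
        hi = min(max(lo + 1, int(np.ceil(end * fps))), len(frames))
        pooled.append(frames[lo:hi].mean(axis=0))   # mean-pool the frames inside the word span
    return np.stack(pooled)                         # (n_words, D): one row per word

# Word-level features from different modalities now share a single time axis, e.g.:
# audio_words = align_by_word_timestamps(audio_feats, 100.0, words)  # 100 fps audio features
# video_words = align_by_word_timestamps(video_feats, 25.0, words)   # 25 fps video features
</code></pre>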
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.00774">arXiv:2411.00774</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.00774">pdf</a>, <a href="https://arxiv.org/format/2411.00774">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Freeze-Omni: A Smart and Low Latency Speech-to-speech Dialogue Model with Frozen LLM </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Wang%2C+X">Xiong Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+Y">Yangze Li</a>, <a href="/search/eess?searchtype=author&amp;query=Fu%2C+C">Chaoyou Fu</a>, <a href="/search/eess?searchtype=author&amp;query=Shen%2C+Y">Yunhang Shen</a>, <a href="/search/eess?searchtype=author&amp;query=Xie%2C+L">Lei Xie</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+K">Ke Li</a>, <a href="/search/eess?searchtype=author&amp;query=Sun%2C+X">Xing Sun</a>, <a href="/search/eess?searchtype=author&amp;query=Ma%2C+L">Long Ma</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.00774v5-abstract-short" style="display: inline;"> Rapidly developing large language models (LLMs) have brought about numerous intelligent applications. In particular, GPT-4o&#39;s excellent duplex speech interaction ability has brought an impressive experience to users. Researchers have recently proposed several multi-modal LLMs in this direction that can achieve user-agent speech-to-speech conversations. This paper proposes a novel speech-text multimodal&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00774v5-abstract-full').style.display = 'inline'; document.getElementById('2411.00774v5-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.00774v5-abstract-full" style="display: none;"> Rapidly developing large language models (LLMs) have brought about numerous intelligent applications. In particular, GPT-4o&#39;s excellent duplex speech interaction ability has brought an impressive experience to users. Researchers have recently proposed several multi-modal LLMs in this direction that can achieve user-agent speech-to-speech conversations. This paper proposes a novel speech-text multimodal LLM architecture called Freeze-Omni. Our main contribution is that the speech input and output modalities can be easily connected to a textual LLM while keeping the LLM&#39;s parameters frozen throughout the training process. We design a three-stage training strategy for modeling both the speech input and output, enabling Freeze-Omni to obtain speech-to-speech conversation ability using text-speech paired data (such as ASR and TTS data) and only 60,000 multi-round text Q&amp;A data on 8 GPUs.
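<p class="is-size-7">The central property claimed here, training only the speech-side modules while the backbone LLM stays frozen, fits in a few lines of PyTorch. The module names and the HF-style <code>inputs_embeds</code> call below are placeholders, not Freeze-Omni's actual components.</p>
<pre><code class="language-python">
import torch.nn as nn

class SpeechToFrozenLLM(nn.Module):
    """Trainable speech encoder plus adapter in front of a frozen text LLM (placeholder modules)."""
    def __init__(self, llm, d_speech=512, d_model=4096):
        super().__init__()
        self.llm = llm
        for p in self.llm.parameters():
            p.requires_grad = False      # the backbone LLM stays frozen throughout training
        self.encoder = nn.GRU(d_speech, d_model, batch_first=True)  # stand-in speech encoder
        self.adapter = nn.Linear(d_model, d_model)  # projects speech states into the LLM embedding space

    def forward(self, speech_feats):     # speech_feats: (B, T, d_speech)
        h, _ = self.encoder(speech_feats)
        return self.llm(inputs_embeds=self.adapter(h))  # assumes an HF-style inputs_embeds API
</code></pre>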
Moreover, we can effectively ensure that the intelligence of Freeze-Omni in the speech modality is at the same level as that in the text modality of its backbone LLM, while achieving low-latency end-to-end spoken responses. In addition, we design a method to achieve duplex dialogue ability through multi-task training, giving Freeze-Omni a more natural dialogue style between users and agents. In summary, Freeze-Omni holds great potential to conduct speech-to-speech dialogue based on a multimodal LLM under the condition of a frozen LLM, avoiding the catastrophic forgetting problem caused by limited data and training resources. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00774v5-abstract-full').style.display = 'none'; document.getElementById('2411.00774v5-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 1 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project Page: https://freeze-omni.github.io/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.00656">arXiv:2411.00656</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.00656">pdf</a>, <a href="https://arxiv.org/format/2411.00656">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Identification of Analytic Nonlinear Dynamical Systems with Non-asymptotic Guarantees </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Musavi%2C+N">Negin Musavi</a>, <a href="/search/eess?searchtype=author&amp;query=Guo%2C+Z">Ziyao Guo</a>, <a href="/search/eess?searchtype=author&amp;query=Dullerud%2C+G">Geir Dullerud</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+Y">Yingying Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.00656v2-abstract-short" style="display: inline;"> This paper focuses on the system identification of an important class of nonlinear systems: linearly parameterized nonlinear systems, which enjoys wide applications in robotics and other mechanical systems.
We consider two system identification methods: least-squares estimation (LSE), which is a point estimation method; and set-membership estimation (SME), which estimates an uncertainty set that c&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00656v2-abstract-full').style.display = 'inline'; document.getElementById('2411.00656v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.00656v2-abstract-full" style="display: none;"> This paper focuses on the system identification of an important class of nonlinear systems: linearly parameterized nonlinear systems, which enjoys wide applications in robotics and other mechanical systems. We consider two system identification methods: least-squares estimation (LSE), which is a point estimation method; and set-membership estimation (SME), which estimates an uncertainty set that contains the true parameters. We provide non-asymptotic convergence rates for LSE and SME under i.i.d. control inputs and control policies with i.i.d. random perturbations, both of which are considered as non-active-exploration inputs. Compared with the counter-example based on piecewise-affine systems in the literature, the success of non-active exploration in our setting relies on a key assumption on the system dynamics: we require the system functions to be real-analytic. Our results, together with the piecewise-affine counter-example, reveal the importance of differentiability in nonlinear system identification through non-active exploration. Lastly, we numerically compare our theoretical bounds with the empirical performance of LSE and SME on a pendulum example and a quadrotor example. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00656v2-abstract-full').style.display = 'none'; document.getElementById('2411.00656v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 1 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
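<p class="is-size-7">For a linearly parameterized system x_{t+1} = theta* . phi(x_t, u_t) + w_t, the LSE discussed above is ordinary least squares on stacked regressors. A runnable toy example, with an assumed real-analytic feature map phi and i.i.d. (non-active-exploration) inputs as in the paper's setting:</p>
<pre><code class="language-python">
import numpy as np

def phi(x, u):
    """Toy real-analytic feature map (an assumption for illustration)."""
    return np.array([x, np.sin(x), u])

rng = np.random.default_rng(0)
theta_true = np.array([0.8, 0.3, 0.5])
x, X, Y = 0.0, [], []
for _ in range(500):
    u = rng.normal()                      # i.i.d. random input, i.e. non-active exploration
    f = phi(x, u)
    x = theta_true @ f + 0.01 * rng.normal()   # one simulation step with process noise
    X.append(f)
    Y.append(x)

theta_lse, *_ = np.linalg.lstsq(np.array(X), np.array(Y), rcond=None)
print(theta_lse)                          # least-squares estimate, close to [0.8, 0.3, 0.5]
</code></pre>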
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">NeurIPS 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.23577">arXiv:2410.23577</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.23577">pdf</a>, <a href="https://arxiv.org/format/2410.23577">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> MS-Glance: Bio-Insipred Non-semantic Context Vectors and their Applications in Supervising Image Reconstruction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Gao%2C+Z">Ziqi Gao</a>, <a href="/search/eess?searchtype=author&amp;query=Yang%2C+W">Wendi Yang</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+Y">Yujia Li</a>, <a href="/search/eess?searchtype=author&amp;query=Xing%2C+L">Lei Xing</a>, <a href="/search/eess?searchtype=author&amp;query=Zhou%2C+S+K">S. Kevin Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.23577v2-abstract-short" style="display: inline;"> Non-semantic context information is crucial for visual recognition, as the human visual perception system first uses global statistics to process scenes rapidly before identifying specific objects. However, while semantic information is increasingly incorporated into computer vision tasks such as image reconstruction, non-semantic information, such as global spatial structures, is often overlooked&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23577v2-abstract-full').style.display = 'inline'; document.getElementById('2410.23577v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.23577v2-abstract-full" style="display: none;"> Non-semantic context information is crucial for visual recognition, as the human visual perception system first uses global statistics to process scenes rapidly before identifying specific objects. However, while semantic information is increasingly incorporated into computer vision tasks such as image reconstruction, non-semantic information, such as global spatial structures, is often overlooked. To bridge the gap, we propose a biologically informed non-semantic context descriptor, \textbf{MS-Glance}, along with the Glance Index Measure for comparing two images. A Global Glance vector is formulated by randomly retrieving pixels based on a perception-driven rule from an image to form a vector representing non-semantic global context, while a local Glance vector is a flattened local image window, mimicking a zoom-in observation. The Glance Index is defined as the inner product of two standardized sets of Glance vectors. 
We evaluate the effectiveness of incorporating Glance supervision in two reconstruction tasks: image fitting with implicit neural representation (INR) and undersampled MRI reconstruction. Extensive experimental results show that MS-Glance outperforms existing image restoration losses across both natural and medical images. The code is available at \url{https://github.com/Z7Gao/MSGlance}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23577v2-abstract-full').style.display = 'none'; document.getElementById('2410.23577v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 30 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by WACV 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.22646">arXiv:2410.22646</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.22646">pdf</a>, <a href="https://arxiv.org/format/2410.22646">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> SleepNetZero: Zero-Burden Zero-Shot Reliable Sleep Staging With Neural Networks Based on Ballistocardiograms </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Li%2C+S">Shuzhen Li</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+Y">Yuxin Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+X">Xuesong Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Gao%2C+R">Ruiyang Gao</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+Y">Yupeng Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Yu%2C+C">Chao Yu</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+Y">Yunfei Li</a>, <a href="/search/eess?searchtype=author&amp;query=Ye%2C+Z">Ziyi Ye</a>, <a href="/search/eess?searchtype=author&amp;query=Huang%2C+W">Weijun Huang</a>, <a href="/search/eess?searchtype=author&amp;query=Yi%2C+H">Hongliang Yi</a>, <a href="/search/eess?searchtype=author&amp;query=Leng%2C+Y">Yue Leng</a>, <a href="/search/eess?searchtype=author&amp;query=Wu%2C+Y">Yi Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.22646v1-abstract-short" style="display: inline;"> Sleep monitoring plays a crucial role in maintaining good health, with sleep staging serving as an essential metric in the monitoring process. Traditional methods, utilizing medical sensors like EEG and ECG, can be effective but often present challenges such as unnatural user experience, complex deployment, and high costs. 
Ballistocardiography (BCG), a type of piezoelectric sensor signal, offers a&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22646v1-abstract-full').style.display = 'inline'; document.getElementById('2410.22646v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.22646v1-abstract-full" style="display: none;"> Sleep monitoring plays a crucial role in maintaining good health, with sleep staging serving as an essential metric in the monitoring process. Traditional methods, utilizing medical sensors like EEG and ECG, can be effective but often present challenges such as unnatural user experience, complex deployment, and high costs. Ballistocardiography (BCG), a type of piezoelectric sensor signal, offers a non-invasive, user-friendly, and easily deployable alternative for long-term home monitoring. However, reliable BCG-based sleep staging is challenging due to the limited sleep monitoring data available for BCG. A restricted training dataset prevents the model from generalizing across populations. Additionally, transferring to BCG faces difficulty in ensuring model robustness when migrating from other data sources. To address these issues, we introduce SleepNetZero, a zero-shot learning based approach for sleep staging. To tackle the generalization challenge, we propose a series of BCG feature extraction methods that align BCG components with corresponding respiratory, cardiac, and movement channels in PSG. This allows models to be trained on large-scale PSG datasets that are diverse in population. For the migration challenge, we employ data augmentation techniques, significantly enhancing generalizability. We conducted extensive training and testing on large datasets (12393 records from 9637 different subjects), achieving an accuracy of 0.803 and a Cohen&#39;s Kappa of 0.718. SleepNetZero was also deployed in a real prototype (monitoring pads) and tested in actual hospital settings (265 users), demonstrating an accuracy of 0.697 and a Cohen&#39;s Kappa of 0.589. To the best of our knowledge, this work represents the first known reliable BCG-based sleep staging effort and marks a significant step towards in-home health monitoring. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22646v1-abstract-full').style.display = 'none'; document.getElementById('2410.22646v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024.
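<p class="is-size-7">For reference, the Cohen's Kappa reported above is standard chance-corrected agreement between predicted and reference sleep stages; a compact implementation of the textbook formula:</p>
<pre><code class="language-python">
import numpy as np

def cohens_kappa(y_true, y_pred, n_classes):
    """kappa = (p_o - p_e) / (1 - p_e): observed agreement corrected for chance agreement."""
    cm = np.zeros((n_classes, n_classes))
    for t, p in zip(y_true, y_pred):
        cm[t, p] += 1                     # confusion matrix of (true, predicted) labels
    cm = cm / cm.sum()
    p_o = np.trace(cm)                    # observed agreement
    p_e = float(cm.sum(axis=1) @ cm.sum(axis=0))   # chance agreement from the marginals
    return (p_o - p_e) / (1.0 - p_e)
</code></pre>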
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">25 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.20992">arXiv:2410.20992</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.20992">pdf</a>, <a href="https://arxiv.org/format/2410.20992">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Enhanced channel estimation for near-field IRS-aided multi-user MIMO system via deep residual network </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Wang%2C+Y">Yan Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+Y">Yongqiang Li</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+M">Minghao Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Yao%2C+Y">Yu Yao</a>, <a href="/search/eess?searchtype=author&amp;query=Shu%2C+F">Feng Shu</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+J">Jiangzhou Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.20992v1-abstract-short" style="display: inline;"> In this paper, channel estimation (CE) of intelligent reflecting surface aided near-field (NF) multi-user communication is investigated. Initially, the least square (LS) estimator and minimum mean square error (MMSE) estimator for the estimated channel are designed, and their mean square errors (MSEs) are derived. Subsequently, to fully harness the potential of deep residual networks (DRNs) in den&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20992v1-abstract-full').style.display = 'inline'; document.getElementById('2410.20992v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.20992v1-abstract-full" style="display: none;"> In this paper, channel estimation (CE) of intelligent reflecting surface aided near-field (NF) multi-user communication is investigated. Initially, the least square (LS) estimator and minimum mean square error (MMSE) estimator for the estimated channel are designed, and their mean square errors (MSEs) are derived. Subsequently, to fully harness the potential of deep residual networks (DRNs) in denoising, the above CE problem is reconceptualized as a denoising task, and a DRN-driven NF CE (DRN-NFCE) framework is proposed, and the Cram$\acute{e}$r-Rao lower bound (CRLB) is derived to serve as a benchmark for performance evaluation. In addition, to effectively capture and leverage these diverse channel features, a federated learning (FL) based global DRN-NFCE network, namely FL-DRN-NFCE, is constructed through collaborative training and joint optimization of single region DRN-NFCE (SR-DRN-NFCE) networks in different user regions. Here, users are divided into multiple regions. Correspondingly, a user region classifier based on convolutional neural network is designed to achieve the goal of matching datasets from different user regions to the corresponding SR-DRN-NFCE network. 
Simulation results demonstrate that the proposed FL-DRN-NFCE framework outperforms the LS and MMSE estimators, as well as a network without residual connections, in terms of MSE, and that it achieves higher CE accuracy than the SR-DRN-NFCE method. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20992v1-abstract-full').style.display = 'none'; document.getElementById('2410.20992v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.20073">arXiv:2410.20073</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.20073">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Medical Physics">physics.med-ph</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Optics">physics.optics</span> </div> </div> <p class="title is-5 mathjax"> Super-resolved virtual staining of label-free tissue using diffusion models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+Y">Yijie Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Huang%2C+L">Luzhe Huang</a>, <a href="/search/eess?searchtype=author&amp;query=Pillar%2C+N">Nir Pillar</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+Y">Yuzhu Li</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+H">Hanlong Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Ozcan%2C+A">Aydogan Ozcan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.20073v1-abstract-short" style="display: inline;"> Virtual staining of tissue offers a powerful tool for transforming label-free microscopy images of unstained tissue into equivalents of histochemically stained samples. This study presents a diffusion model-based super-resolution virtual staining approach utilizing a Brownian bridge process to enhance both the spatial resolution and fidelity of label-free virtual tissue staining, addressing the li&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20073v1-abstract-full').style.display = 'inline'; document.getElementById('2410.20073v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.20073v1-abstract-full" style="display: none;"> Virtual staining of tissue offers a powerful tool for transforming label-free microscopy images of unstained tissue into equivalents of histochemically stained samples.
This study presents a diffusion model-based super-resolution virtual staining approach utilizing a Brownian bridge process to enhance both the spatial resolution and fidelity of label-free virtual tissue staining, addressing the limitations of traditional deep learning-based methods. Our approach integrates novel sampling techniques into a diffusion model-based image inference process to significantly reduce the variance in the generated virtually stained images, resulting in more stable and accurate outputs. Blindly applied to lower-resolution auto-fluorescence images of label-free human lung tissue samples, the diffusion-based super-resolution virtual staining model consistently outperformed conventional approaches in resolution, structural similarity and perceptual accuracy, successfully achieving a super-resolution factor of 4-5x, increasing the output space-bandwidth product by 16-25-fold compared to the input label-free microscopy images. Diffusion-based super-resolved virtual tissue staining not only improves resolution and image quality but also enhances the reliability of virtual staining without traditional chemical staining, offering significant potential for clinical diagnostics. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20073v1-abstract-full').style.display = 'none'; document.getElementById('2410.20073v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">26 Pages, 5 Figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.19765">arXiv:2410.19765</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.19765">pdf</a>, <a href="https://arxiv.org/format/2410.19765">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1007/978-3-031-72117-5_2">10.1007/978-3-031-72117-5_2 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> A New Perspective to Boost Performance Fairness for Medical Federated Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Yan%2C+Y">Yunlu Yan</a>, <a href="/search/eess?searchtype=author&amp;query=Zhu%2C+L">Lei Zhu</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+Y">Yuexiang Li</a>, <a href="/search/eess?searchtype=author&amp;query=Xu%2C+X">Xinxing Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Goh%2C+R+S+M">Rick Siow Mong Goh</a>, 
<a href="/search/eess?searchtype=author&amp;query=Liu%2C+Y">Yong Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Khan%2C+S">Salman Khan</a>, <a href="/search/eess?searchtype=author&amp;query=Feng%2C+C">Chun-Mei Feng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.19765v1-abstract-short" style="display: inline;"> Improving the fairness of federated learning (FL) benefits healthy and sustainable collaboration, especially for medical applications. However, existing fair FL methods ignore the specific characteristics of medical FL applications, i.e., domain shift among the datasets from different hospitals. In this work, we propose Fed-LWR to improve performance fairness from the perspective of feature shift,&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.19765v1-abstract-full').style.display = 'inline'; document.getElementById('2410.19765v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.19765v1-abstract-full" style="display: none;"> Improving the fairness of federated learning (FL) benefits healthy and sustainable collaboration, especially for medical applications. However, existing fair FL methods ignore the specific characteristics of medical FL applications, i.e., domain shift among the datasets from different hospitals. In this work, we propose Fed-LWR to improve performance fairness from the perspective of feature shift, a key issue influencing the performance of medical FL systems caused by domain shift. Specifically, we dynamically perceive the bias of the global model across all hospitals by estimating the layer-wise difference in feature representations between local and global models. To minimize global divergence, we assign higher weights to hospitals with larger differences. The estimated client weights help us to re-aggregate the local models per layer to obtain a fairer global model. We evaluate our method on two widely used federated medical image segmentation benchmarks. The results demonstrate that our method achieves better and fairer performance compared with several state-of-the-art fair FL methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.19765v1-abstract-full').style.display = 'none'; document.getElementById('2410.19765v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">11 pages, 2 Figures</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> International Conference on Medical Image Computing and Computer-Assisted Intervention 2024 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.18103">arXiv:2410.18103</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.18103">pdf</a>, <a href="https://arxiv.org/format/2410.18103">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> A Hybrid Graph Neural Network for Enhanced EEG-Based Depression Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Wang%2C+Y">Yiye Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Zheng%2C+W">Wenming Zheng</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+Y">Yang Li</a>, <a href="/search/eess?searchtype=author&amp;query=Yang%2C+H">Hao Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.18103v1-abstract-short" style="display: inline;"> Graph neural networks (GNNs) are becoming increasingly popular for EEG-based depression detection. However, previous GNN-based methods fail to sufficiently consider the characteristics of depression, thus limiting their performance. Firstly, studies in neuroscience indicate that depression patients exhibit both common and individualized brain abnormal patterns. Previous GNN-based approaches typica&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18103v1-abstract-full').style.display = 'inline'; document.getElementById('2410.18103v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.18103v1-abstract-full" style="display: none;"> Graph neural networks (GNNs) are becoming increasingly popular for EEG-based depression detection. However, previous GNN-based methods fail to sufficiently consider the characteristics of depression, thus limiting their performance. Firstly, studies in neuroscience indicate that depression patients exhibit both common and individualized brain abnormal patterns. Previous GNN-based approaches typically focus either on fixed graph connections to capture common abnormal brain patterns or on adaptive connections to capture individualized patterns, which is inadequate for depression detection. Secondly, brain network exhibits a hierarchical structure, which includes the arrangement from channel-level graph to region-level graph. This hierarchical structure varies among individuals and contains significant information relevant to detecting depression. Nonetheless, previous GNN-based methods overlook these individualized hierarchical information. 
To address these issues, we propose a Hybrid GNN (HGNN) that merges a Common Graph Neural Network (CGNN) branch utilizing fixed connections and an Individualized Graph Neural Network (IGNN) branch employing adaptive connections. The two branches capture common and individualized depression patterns respectively, complementing each other. Furthermore, we enhance the IGNN branch with a Graph Pooling and Unpooling Module (GPUM) to extract individualized hierarchical information. Extensive experiments on two public datasets show that our model achieves state-of-the-art performance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18103v1-abstract-full').style.display = 'none'; document.getElementById('2410.18103v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.17691">arXiv:2410.17691</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.17691">pdf</a>, <a href="https://arxiv.org/format/2410.17691">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Neurons and Cognition">q-bio.NC</span> </div> </div> <p class="title is-5 mathjax"> Longitudinal Causal Image Synthesis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Li%2C+Y">Yujia Li</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+H">Han Li</a>, <a href="/search/eess?searchtype=author&amp;query=Zhou%2C+a+S+K">S. Kevin Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.17691v1-abstract-short" style="display: inline;"> Clinical decision-making relies heavily on causal reasoning and longitudinal analysis. For example, for a patient with Alzheimer&#39;s disease (AD), how will the brain grey matter atrophy in a year if intervened on the A-beta level in cerebrospinal fluid? The answer is fundamental to diagnosis and follow-up treatment. However, this kind of inquiry involves counterfactual medical images which can not b&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.17691v1-abstract-full').style.display = 'inline'; document.getElementById('2410.17691v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.17691v1-abstract-full" style="display: none;"> Clinical decision-making relies heavily on causal reasoning and longitudinal analysis. For example, for a patient with Alzheimer&#39;s disease (AD), how will the brain grey matter atrophy in a year if intervened on the A-beta level in cerebrospinal fluid? The answer is fundamental to diagnosis and follow-up treatment.
However, this kind of inquiry involves counterfactual medical images, which cannot be acquired by instrumental means or by correlation-based image synthesis models. Hence, a causal longitudinal image synthesis (CLIS) method, enabling the synthesis of such images, is highly valuable. However, building a CLIS model confronts three primary yet unmet challenges: mismatched dimensionality between high-dimensional images and low-dimensional tabular variables, inconsistent collection intervals of follow-up data, and inadequate causal modeling capability of existing causal graph methods for image data. In this paper, we establish a tabular-visual causal graph (TVCG) for CLIS, overcoming these challenges through a novel integration of generative imaging, continuous-time modeling, and structural causal models combined with a neural network. We train our CLIS model on the ADNI dataset and evaluate it on two other AD datasets; the results illustrate the outstanding yet controllable quality of the synthesized images and the contributions of synthesized MRI to the characterization of AD progression, substantiating its reliability and utility in clinics. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.17691v1-abstract-full').style.display = 'none'; document.getElementById('2410.17691v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> </ol> </div> </main> </body> </html>
class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 
47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>
