Search | arXiv e-print repository
<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 239 results for author: <span class="mathjax">Zhang, K</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> <div class="content"> <form method="GET" action="/search/eess" aria-role="search"> Searching in archive <strong>eess</strong>. <a href="/search/?searchtype=author&query=Zhang%2C+K">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Zhang, K"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Zhang%2C+K&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Zhang, K"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Zhang%2C+K&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Zhang%2C+K&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhang%2C+K&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhang%2C+K&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhang%2C+K&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhang%2C+K&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.09531">arXiv:2502.09531</a> <span> [<a href="https://arxiv.org/pdf/2502.09531">pdf</a>, <a href="https://arxiv.org/ps/2502.09531">ps</a>, <a href="https://arxiv.org/format/2502.09531">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Data-Enabled Predictive Control for Flexible Spacecraft </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Wang%2C+H">Huanqing Wang</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+K">Kaixiang Zhang</a>, <a href="/search/eess?searchtype=author&query=Vahidi-Moghaddam%2C+A">Amin Vahidi-Moghaddam</a>, <a href="/search/eess?searchtype=author&query=An%2C+H">Haowei An</a>, <a href="/search/eess?searchtype=author&query=Li%2C+N">Nan Li</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+D">Daning Huang</a>, <a href="/search/eess?searchtype=author&query=Li%2C+Z">Zhaojian Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.09531v1-abstract-short" style="display: inline;"> Spacecraft are vital to space exploration and are often equipped with lightweight, flexible appendages to meet strict weight constraints. These appendages pose significant challenges for modeling and control due to their inherent nonlinearity. Data-driven control methods have gained traction to address such challenges. 
This paper introduces, to the best of the authors' knowledge, the first applica… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.09531v1-abstract-full').style.display = 'inline'; document.getElementById('2502.09531v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.09531v1-abstract-full" style="display: none;"> Spacecraft are vital to space exploration and are often equipped with lightweight, flexible appendages to meet strict weight constraints. These appendages pose significant challenges for modeling and control due to their inherent nonlinearity. Data-driven control methods have gained traction to address such challenges. This paper introduces, to the best of the authors' knowledge, the first application of the data-enabled predictive control (DeePC) framework to boundary control for flexible spacecraft. Leveraging the fundamental lemma, DeePC constructs a non-parametric model by utilizing recorded past trajectories, eliminating the need for explicit model development. The developed method also incorporates dimension reduction techniques to enhance computational efficiency. Through comprehensive numerical simulations, this study compares the proposed method with Lyapunov-based control, demonstrating superior performance and offering a thorough evaluation of data-driven control for flexible spacecraft. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.09531v1-abstract-full').style.display = 'none'; document.getElementById('2502.09531v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.09992">arXiv:2501.09992</a> <span> [<a href="https://arxiv.org/pdf/2501.09992">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> A Novel Modulation Scheme Based on the Kramers--Kronig Relations for Optical IM/DD Systems </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Dong%2C+X">Xiaohe Dong</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+K">Kuokuo Zhang</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+J">Jiarui Zhang</a>, <a href="/search/eess?searchtype=author&query=Yang%2C+B">Baoyin Yang</a>, <a href="/search/eess?searchtype=author&query=Sun%2C+C">Caiming Sun</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.09992v2-abstract-short" style="display: inline;"> The ever-growing demand for higher data rates in optical communication systems necessitates the development of advanced modulation formats capable of significantly enhancing system performance. In this work, we propose a novel modulation format derived from the Kramers--Kronig relations. 
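The fundamental-lemma construction at the heart of DeePC is concrete enough to sketch. Below is a minimal, illustrative Python sketch (not the paper's implementation): recorded input/output data are stacked into block-Hankel matrices, and any admissible trajectory is expressed as a linear combination of recorded ones. The toy first-order system, window sizes, and the unconstrained least-squares solve are all assumptions standing in for the paper's constrained spacecraft setup.

```python
# Minimal DeePC-style predictor sketch: Hankel matrices from recorded data,
# then solve for a combination vector g consistent with the recent past.
import numpy as np

def block_hankel(w, L):
    """Depth-L block-Hankel matrix of a (T, m) signal; shape (L*m, T-L+1)."""
    T, m = w.shape
    cols = T - L + 1
    return np.vstack([w[i:i + cols].T for i in range(L)])

rng = np.random.default_rng(0)
T, Tini, N = 200, 4, 10           # data length, past window, horizon (assumed)
ud = rng.normal(size=(T, 1))      # recorded inputs (persistently exciting)
yd = np.zeros((T, 1))             # recorded outputs of a toy SISO system;
for t in range(1, T):             # a flexible-spacecraft model would replace
    yd[t] = 0.9 * yd[t - 1] + 0.1 * ud[t - 1]   # this stand-in recursion

Hu = block_hankel(ud, Tini + N)
Hy = block_hankel(yd, Tini + N)
Up, Uf = Hu[:Tini], Hu[Tini:]
Yp, Yf = Hy[:Tini], Hy[Tini:]

# Fundamental lemma: any consistent trajectory is a combination g of recorded
# ones. Solve [Up; Yp; Uf] g = [uini; yini; uf], then predict yf = Yf g.
uini, yini = ud[-Tini:], yd[-Tini:]
uf = np.ones((N, 1))              # candidate future input sequence
A = np.vstack([Up, Yp, Uf])
b = np.vstack([uini, yini, uf])
g, *_ = np.linalg.lstsq(A, b, rcond=None)
yf = Yf @ g                       # predicted outputs over the horizon
print(yf.ravel())
```

The full DeePC controller optimizes uf subject to constraints and regularization; this sketch shows only the data-driven prediction step that replaces an explicit model.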
2. arXiv:2501.09992 [pdf]
   Categories: eess.SP (Signal Processing)
   Title: A Novel Modulation Scheme Based on the Kramers–Kronig Relations for Optical IM/DD Systems
   Authors: Xiaohe Dong, Kuokuo Zhang, Jiarui Zhang, Baoyin Yang, Caiming Sun
   Abstract: The ever-growing demand for higher data rates in optical communication systems necessitates the development of advanced modulation formats capable of significantly enhancing system performance. In this work, we propose a novel modulation format derived from the Kramers–Kronig relations. This scheme effectively reduces the complexity of digital filtering and alleviates the demands on the digital-to-analog converter, offering a practical solution for high-speed optical communication. The proposed modulation format was rigorously validated through experimental investigations using an optical wireless link. The results demonstrate a notable improvement in bit error rate (BER) performance and receiver sensitivity compared to PAM-4 and CAP-16 modulation schemes, with enhancements of 0.6 dB and 1.5 dB in receiver sensitivity, respectively. These improvements enable higher data transmission rates, positioning the Kramers–Kronig relations-based modulation format as a promising alternative to existing modulation techniques. Its potential to enhance the efficiency and capacity of optical communication systems is clearly evident. Future work will focus on extending its application to more complex scenarios, such as high-speed underwater optical communication systems, where advanced modulation formats are critical for overcoming bandwidth limitations.
   Submitted: 20 January, 2025; v1 submitted 17 January, 2025; originally announced January 2025.
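The Kramers–Kronig relation underlying such schemes can be illustrated directly: for a minimum-phase optical field, the phase is the Hilbert transform of the log-magnitude, so a receiver measuring only intensity can recover the full complex field. The sketch below demonstrates that relation on a toy signal; the carrier level and message are assumptions, and this is the textbook KK recovery, not the paper's proposed modulation format.

```python
# KK phase retrieval sketch: recover a complex field from intensity alone.
import numpy as np
from scipy.signal import hilbert

rng = np.random.default_rng(1)
n = 4096
msg = 0.1 * rng.normal(size=n)        # weak message band (assumed)
carrier = 3.0                          # strong carrier keeps minimum phase
field = carrier + hilbert(msg)         # analytic (single-sideband) signal
intensity = np.abs(field) ** 2         # what a photodiode actually measures

log_mag = 0.5 * np.log(intensity)      # log-magnitude from intensity
phase = np.imag(hilbert(log_mag))      # Hilbert transform of log-magnitude
recovered = np.exp(log_mag + 1j * phase)

err = np.mean(np.abs(recovered - field) ** 2) / np.mean(np.abs(field) ** 2)
print(f"relative reconstruction error: {err:.2e}")
```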
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.07120">arXiv:2501.07120</a> <span> [<a href="https://arxiv.org/pdf/2501.07120">pdf</a>, <a href="https://arxiv.org/format/2501.07120">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> MSV-Mamba: A Multiscale Vision Mamba Network for Echocardiography Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Yang%2C+X">Xiaoxian Yang</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+Q">Qi Wang</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+K">Kaiqi Zhang</a>, <a href="/search/eess?searchtype=author&query=Wei%2C+K">Ke Wei</a>, <a href="/search/eess?searchtype=author&query=Lyu%2C+J">Jun Lyu</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+L">Lingchao Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.07120v1-abstract-short" style="display: inline;"> Ultrasound imaging frequently encounters challenges, such as those related to elevated noise levels, diminished spatiotemporal resolution, and the complexity of anatomical structures. These factors significantly hinder the model's ability to accurately capture and analyze structural relationships and dynamic patterns across various regions of the heart. Mamba, an emerging model, is one of the most… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.07120v1-abstract-full').style.display = 'inline'; document.getElementById('2501.07120v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.07120v1-abstract-full" style="display: none;"> Ultrasound imaging frequently encounters challenges, such as those related to elevated noise levels, diminished spatiotemporal resolution, and the complexity of anatomical structures. These factors significantly hinder the model's ability to accurately capture and analyze structural relationships and dynamic patterns across various regions of the heart. Mamba, an emerging model, is one of the most cutting-edge approaches that is widely applied to diverse vision and language tasks. To this end, this paper introduces a U-shaped deep learning model incorporating a large-window Mamba scale (LMS) module and a hierarchical feature fusion approach for echocardiographic segmentation. First, a cascaded residual block serves as an encoder and is employed to incrementally extract multiscale detailed features. Second, a large-window multiscale mamba module is integrated into the decoder to capture global dependencies across regions and enhance the segmentation capability for complex anatomical structures. Furthermore, our model introduces auxiliary losses at each decoder layer and employs a dual attention mechanism to fuse multilayer features both spatially and across channels. This approach enhances segmentation performance and accuracy in delineating complex anatomical structures. 
Finally, the experimental results using the EchoNet-Dynamic and CAMUS datasets demonstrate that the model outperforms other methods in terms of both accuracy and robustness. For the segmentation of the left ventricular endocardium (${LV}_{endo}$), the model achieved optimal values of 95.01 and 93.36, respectively, while for the left ventricular epicardium (${LV}_{epi}$), values of 87.35 and 87.80, respectively, were achieved. This represents an improvement ranging between 0.54 and 1.11 compared with the best-performing model. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.07120v1-abstract-full').style.display = 'none'; document.getElementById('2501.07120v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.19553">arXiv:2412.19553</a> <span> [<a href="https://arxiv.org/pdf/2412.19553">pdf</a>, <a href="https://arxiv.org/format/2412.19553">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Structural Similarity in Deep Features: Image Quality Assessment Robust to Geometrically Disparate Reference </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhang%2C+K">Keke Zhang</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+W">Weiling Chen</a>, <a href="/search/eess?searchtype=author&query=Zhao%2C+T">Tiesong Zhao</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+Z">Zhou Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.19553v1-abstract-short" style="display: inline;"> Image Quality Assessment (IQA) with references plays an important role in optimizing and evaluating computer vision tasks. Traditional methods assume that all pixels of the reference and test images are fully aligned. Such Aligned-Reference IQA (AR-IQA) approaches fail to address many real-world problems with various geometric deformations between the two images. Although significant effort has be… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.19553v1-abstract-full').style.display = 'inline'; document.getElementById('2412.19553v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.19553v1-abstract-full" style="display: none;"> Image Quality Assessment (IQA) with references plays an important role in optimizing and evaluating computer vision tasks. Traditional methods assume that all pixels of the reference and test images are fully aligned. Such Aligned-Reference IQA (AR-IQA) approaches fail to address many real-world problems with various geometric deformations between the two images. 
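Deep supervision of the kind described above (an auxiliary loss at every decoder layer) is a standard pattern that can be sketched briefly. The PyTorch snippet below is generic scaffolding under assumed channel sizes and loss weights, not the authors' MSV-Mamba code.

```python
# Deep-supervision sketch: each decoder stage gets a 1x1 prediction head,
# and the total loss is a weighted sum of per-stage segmentation losses.
import torch
import torch.nn as nn
import torch.nn.functional as F

class DeepSupervisionHeads(nn.Module):
    def __init__(self, stage_channels=(256, 128, 64), n_classes=2):
        super().__init__()
        self.heads = nn.ModuleList(
            nn.Conv2d(c, n_classes, kernel_size=1) for c in stage_channels
        )

    def forward(self, decoder_feats, target, weights=(0.25, 0.5, 1.0)):
        """decoder_feats: coarse-to-fine feature maps; target: (B, H, W)."""
        loss = 0.0
        for head, feat, w in zip(self.heads, decoder_feats, weights):
            logits = head(feat)
            logits = F.interpolate(logits, size=target.shape[-2:],
                                   mode="bilinear", align_corners=False)
            loss = loss + w * F.cross_entropy(logits, target)
        return loss

feats = [torch.randn(2, c, s, s) for c, s in [(256, 16), (128, 32), (64, 64)]]
target = torch.randint(0, 2, (2, 128, 128))
print(DeepSupervisionHeads()(feats, target))
```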
4. arXiv:2412.19553 [pdf, other]
   Categories: cs.CV (Computer Vision and Pattern Recognition); eess.IV (Image and Video Processing)
   Title: Structural Similarity in Deep Features: Image Quality Assessment Robust to Geometrically Disparate Reference
   Authors: Keke Zhang, Weiling Chen, Tiesong Zhao, Zhou Wang
   Abstract: Image Quality Assessment (IQA) with references plays an important role in optimizing and evaluating computer vision tasks. Traditional methods assume that all pixels of the reference and test images are fully aligned. Such Aligned-Reference IQA (AR-IQA) approaches fail to address many real-world problems with various geometric deformations between the two images. Although significant effort has been made to attack the Geometrically-Disparate-Reference IQA (GDR-IQA) problem, it has been addressed in a task-dependent fashion, for example, by dedicated designs for image super-resolution and retargeting, or by assuming the geometric distortions to be small enough to be countered by translation-robust filters or explicit image registration. Here we rethink this problem and propose a unified, non-training-based Deep Structural Similarity (DeepSSIM) approach that addresses the above problems in a single framework: it assesses the structural similarity of deep features in a simple but efficient way and uses an attention calibration strategy to alleviate attention deviation. The proposed method, without application-specific design, achieves state-of-the-art performance on AR-IQA datasets while showing strong robustness to various GDR-IQA test cases. Interestingly, our tests also show the effectiveness of DeepSSIM as an optimization tool for training image super-resolution, enhancement and restoration, implying an even wider generalizability. (Source code will be made public after the review is completed.)
   Submitted: 27 December, 2024; originally announced December 2024.
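The core of a DeepSSIM-style score is applying SSIM's mean/variance/covariance statistics to deep feature maps instead of pixels. A minimal sketch follows; the stand-in random features, constants, and global pooling are assumptions, not the paper's exact formulation.

```python
# SSIM-style statistics on deep features rather than pixels.
import numpy as np

def deep_ssim(f_ref, f_test, c1=1e-4, c2=9e-4):
    """f_ref, f_test: (C, H, W) deep features of reference/test images."""
    scores = []
    for x, y in zip(f_ref, f_test):        # per-channel SSIM statistics
        mx, my = x.mean(), y.mean()
        vx, vy = x.var(), y.var()
        cxy = ((x - mx) * (y - my)).mean()
        s = ((2 * mx * my + c1) * (2 * cxy + c2)) / \
            ((mx**2 + my**2 + c1) * (vx + vy + c2))
        scores.append(s)
    return float(np.mean(scores))

# Pooling statistics over the whole map discards exact spatial position,
# which is what lends tolerance to geometric disparity between the images.
rng = np.random.default_rng(2)
feat = rng.normal(size=(64, 28, 28))             # stand-in for CNN features
print(deep_ssim(feat, feat))                      # identical features -> 1.0
print(deep_ssim(feat, np.roll(feat, 5, axis=2)))  # shifted but similar stats
```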
5. arXiv:2412.18107 [pdf, other]
   Categories: eess.AS (Audio and Speech Processing); cs.AI (Artificial Intelligence); cs.SD (Sound)
   Title: SongGLM: Lyric-to-Melody Generation with 2D Alignment Encoding and Multi-Task Pre-Training
   Authors: Jiaxing Yu, Xinda Wu, Yunfei Xu, Tieyao Zhang, Songruoyao Wu, Le Ma, Kejun Zhang
   Abstract: Lyric-to-melody generation aims to automatically create melodies based on given lyrics, requiring the capture of complex and subtle correlations between them. However, previous works usually suffer from two main challenges: 1) lyric-melody alignment modeling, which is often simplified to one-syllable/word-to-one-note alignment, while other approaches suffer from low alignment accuracy; 2) lyric-melody harmony modeling, which usually relies heavily on intermediates or strict rules, limiting the model's capabilities and generative diversity. In this paper, we propose SongGLM, a lyric-to-melody generation system that leverages 2D alignment encoding and multi-task pre-training based on the General Language Model (GLM) to guarantee the alignment and harmony between lyrics and melodies. Specifically, 1) we introduce a unified symbolic song representation for lyrics and melodies with word-level and phrase-level (2D) alignment encoding to capture the lyric-melody alignment; 2) we design a multi-task pre-training framework with hierarchical blank infilling objectives (n-gram, phrase, and long span), and incorporate lyric-melody relationships into the extraction of harmonized n-grams to ensure the lyric-melody harmony. We also construct a large-scale lyric-melody paired dataset comprising over 200,000 English song pieces for pre-training and fine-tuning. The objective and subjective results indicate that SongGLM can generate melodies from lyrics with significant improvements in both alignment and harmony, outperforming all the previous baseline methods.
   Submitted: 23 December, 2024; originally announced December 2024.
   Comments: Extended version of paper accepted to AAAI 2025.
6. arXiv:2412.12619 [pdf, other]
   Categories: cs.SD (Sound); cs.AI (Artificial Intelligence); eess.AS (Audio and Speech Processing)
   Title: Phoneme-Level Feature Discrepancies: A Key to Detecting Sophisticated Speech Deepfakes
   Authors: Kuiyuan Zhang, Zhongyun Hua, Rushi Lan, Yushu Zhang, Yifang Guo
   Abstract: Recent advancements in text-to-speech and speech conversion technologies have enabled the creation of highly convincing synthetic speech. While these innovations offer numerous practical benefits, they also pose significant security challenges when maliciously misused. Therefore, there is an urgent need to detect these synthetic speech signals. Phoneme features provide a powerful speech representation for deepfake detection. However, previous phoneme-based detection approaches typically focused on specific phonemes, overlooking temporal inconsistencies across the entire phoneme sequence. In this paper, we develop a new mechanism for detecting speech deepfakes by identifying the inconsistencies of phoneme-level speech features. We design an adaptive phoneme pooling technique that extracts sample-specific phoneme-level features from frame-level speech data. By applying this technique to features extracted by pre-trained audio models on previously unseen deepfake datasets, we demonstrate that deepfake samples often exhibit phoneme-level inconsistencies when compared to genuine speech. To further enhance detection accuracy, we propose a deepfake detector that uses a graph attention network to model the temporal dependencies of phoneme-level features. Additionally, we introduce a random phoneme substitution augmentation technique to increase feature diversity during training. Extensive experiments on four benchmark datasets demonstrate the superior performance of our method over existing state-of-the-art detection methods.
   Submitted: 17 December, 2024; originally announced December 2024.
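The phoneme pooling step can be pictured as segment-wise pooling of frame-level features. The sketch below uses plain averaging over hard-coded segment boundaries; a real pipeline would take boundaries from a phoneme aligner, and the paper's pooling is adaptive rather than a simple mean.

```python
# Segment-wise pooling of frame-level features into phoneme-level features.
import numpy as np

def phoneme_pool(frames, segments):
    """frames: (T, D) frame features; segments: list of (start, end) frames."""
    return np.stack([frames[s:e].mean(axis=0) for s, e in segments])

rng = np.random.default_rng(3)
frames = rng.normal(size=(120, 32))            # e.g. features from a
segments = [(0, 15), (15, 40), (40, 70), (70, 120)]  # pre-trained audio model
pooled = phoneme_pool(frames, segments)        # (4, 32): one vector/phoneme
# Downstream, a detector can score temporal consistency across this sequence,
# e.g. via a graph attention network as the abstract describes.
print(pooled.shape)
```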
7. arXiv:2412.10489 [pdf, other]
   Categories: cs.CV (Computer Vision and Pattern Recognition); cs.AI (Artificial Intelligence); eess.SP (Signal Processing)
   Title: CognitionCapturer: Decoding Visual Stimuli From Human EEG Signal With Multimodal Information
   Authors: Kaifan Zhang, Lihuo He, Xin Jiang, Wen Lu, Di Wang, Xinbo Gao
   Abstract: Electroencephalogram (EEG) signals have attracted significant attention from researchers due to their non-invasive nature and high temporal sensitivity in decoding visual stimuli. However, most recent studies have focused solely on the relationship between EEG and image data pairs, neglecting the valuable "beyond-image-modality" information embedded in EEG signals. This results in the loss of critical multimodal information in EEG. To address this limitation, we propose CognitionCapturer, a unified framework that fully leverages multimodal data to represent EEG signals. Specifically, CognitionCapturer trains Modality Expert Encoders for each modality to extract cross-modal information from the EEG modality. It then introduces a diffusion prior to map the EEG embedding space to the CLIP embedding space; by feeding the result to a pretrained generative model, the framework can reconstruct visual stimuli with high semantic and structural fidelity. Notably, the framework does not require any fine-tuning of the generative models and can be extended to incorporate more modalities. Through extensive experiments, we demonstrate that CognitionCapturer outperforms state-of-the-art methods both qualitatively and quantitatively. Code: https://github.com/XiaoZhangYES/CognitionCapturer.
   Submitted: 24 December, 2024; v1 submitted 13 December, 2024; originally announced December 2024.
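Aligning modality encoders with CLIP's embedding space is typically done with a symmetric contrastive loss; the sketch below shows that standard CLIP-style recipe as an assumption about how the encoder alignment stage could work. The diffusion prior and generative decoding described above are not represented here.

```python
# CLIP-style symmetric InfoNCE alignment between EEG and image embeddings.
import torch
import torch.nn.functional as F

def clip_align_loss(eeg_emb, img_emb, temperature=0.07):
    eeg = F.normalize(eeg_emb, dim=-1)
    img = F.normalize(img_emb, dim=-1)
    logits = eeg @ img.t() / temperature        # (B, B) similarity matrix
    labels = torch.arange(eeg.size(0))          # matched pairs on diagonal
    return (F.cross_entropy(logits, labels) +
            F.cross_entropy(logits.t(), labels)) / 2

eeg_emb = torch.randn(8, 512)   # output of an EEG encoder (assumed dim)
img_emb = torch.randn(8, 512)   # CLIP image embeddings of the stimuli
print(clip_align_loss(eeg_emb, img_emb))
```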
8. arXiv:2412.08247 [pdf, other]
   Categories: cs.SD (Sound); cs.CV (Computer Vision and Pattern Recognition); cs.MM (Multimedia); eess.AS (Audio and Speech Processing)
   Title: MoMuSE: Momentum Multi-modal Target Speaker Extraction for Real-time Scenarios with Impaired Visual Cues
   Authors: Junjie Li, Ke Zhang, Shuai Wang, Kong Aik Lee, Haizhou Li
   Abstract: Audio-visual Target Speaker Extraction (AV-TSE) aims to isolate the speech of a specific target speaker from an audio mixture using time-synchronized visual cues. In real-world scenarios, visual cues are not always available due to various impairments, which undermines the stability of AV-TSE. Despite this challenge, humans can maintain attentional momentum over time, even when the target speaker is not visible. In this paper, we introduce Momentum Multi-modal target Speaker Extraction (MoMuSE), which retains a speaker identity momentum in memory, enabling the model to continuously track the target speaker. Designed for real-time inference, MoMuSE extracts the current speech window with guidance from both visual cues and dynamically updated speaker momentum. Experimental results demonstrate that MoMuSE yields significant improvement, particularly in scenarios with severe impairment of visual cues.
   Submitted: 11 December, 2024; originally announced December 2024.
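The "speaker identity momentum" suggests an exponential-moving-average update of the speaker embedding that coasts when visual cues drop out. The sketch below is one plausible reading of that mechanism; the update rule and coefficient are assumptions, not MoMuSE's actual design.

```python
# EMA-style speaker identity momentum that coasts through cue dropouts.
import numpy as np

def update_momentum(momentum, estimate, visual_ok, beta=0.9):
    """Blend in the new estimate when cues are reliable; otherwise coast."""
    if momentum is None:
        return estimate
    if not visual_ok:          # impaired visual cue: keep the old identity
        return momentum
    return beta * momentum + (1.0 - beta) * estimate

rng = np.random.default_rng(4)
momentum = None
for step in range(5):
    est = rng.normal(size=16)          # speaker embedding for this window
    momentum = update_momentum(momentum, est, visual_ok=(step != 3))
print(momentum[:4])
```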
9. arXiv:2412.06581
   Categories: cs.SD (Sound); cs.AI (Artificial Intelligence); eess.AS (Audio and Speech Processing)
   Title: EmoSpeech: A Corpus of Emotionally Rich and Contextually Detailed Speech Annotations
   Authors: Weizhen Bian, Yubo Zhou, Kaitai Zhang, Xiaohan Gu
   Abstract: Advances in text-to-speech (TTS) technology have significantly improved the quality of generated speech, closely matching the timbre and intonation of the target speaker. However, due to the inherent complexity of human emotional expression, the development of TTS systems capable of controlling subtle emotional differences remains a formidable challenge. Existing emotional speech databases often suffer from overly simplistic labelling schemes that fail to capture a wide range of emotional states, thus limiting the effectiveness of emotion synthesis in TTS applications. To this end, recent efforts have focused on building databases that use natural language annotations to describe speech emotions. However, these approaches are costly and lack the emotional depth needed to train robust systems. In this paper, we propose a novel process aimed at building databases by systematically extracting emotion-rich speech segments and annotating them with detailed natural language descriptions through a generative model. This approach enhances the emotional granularity of the database and significantly reduces the reliance on costly manual annotations by automatically augmenting the data with high-level language models. The resulting rich database provides a scalable and economically viable solution for developing a more nuanced and dynamic basis for emotionally controlled TTS systems.
   Submitted: 12 December, 2024; v1 submitted 9 December, 2024; originally announced December 2024.
   Comments: I did not obtain the necessary approval from my academic supervisor prior to submission and there are issues with my current paper.
10. arXiv:2412.01525 [pdf, other]
   Categories: eess.IV (Image and Video Processing); cs.CV (Computer Vision and Pattern Recognition)
   Title: Take Your Steps: Hierarchically Efficient Pulmonary Disease Screening via CT Volume Compression
   Authors: Qian Shao, Kai Zhang, Bang Du, Zepeng Li, Yixuan Wu, Qiyuan Chen, Jian Wu, Jintai Chen, Honghao Gao, Hongxia Xu
   Abstract: Deep learning models are widely used to process Computed Tomography (CT) data in the automated screening of pulmonary diseases, significantly reducing the workload of physicians. However, the three-dimensional nature of CT volumes involves an excessive number of voxels, which significantly increases the complexity of model processing. Previous screening approaches often overlook this issue, which undoubtedly reduces screening efficiency. Towards efficient and effective screening, we design a hierarchical approach that reduces the computational cost of pulmonary disease screening. The new approach re-organizes the screening workflow into three steps. First, we propose a Computed Tomography Volume Compression (CTVC) method to select a small slice subset that comprehensively represents the whole CT volume. Second, the selected CT slices are used to detect pulmonary diseases coarsely via a lightweight classification model. Third, an uncertainty measurement strategy is applied to identify samples with low diagnostic confidence, which are re-examined by radiologists. Experiments on two public pulmonary disease datasets demonstrate that our approach achieves comparable accuracy and recall while reducing the time by 50–70% compared with the counterparts using full CT volumes. Besides, we also found that our approach outperforms previous cutting-edge CTVC methods in retaining important indications after compression.
   Submitted: 3 December, 2024; v1 submitted 2 December, 2024; originally announced December 2024.
   Comments: Under Review.
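The third step (uncertainty-based referral) can be sketched directly: compute the predictive entropy of the lightweight classifier's output and route high-entropy studies to a radiologist. The threshold below is an assumed operating point, not the paper's.

```python
# Entropy-based triage: refer low-confidence cases for human review.
import numpy as np

def predictive_entropy(probs):
    p = np.clip(probs, 1e-12, 1.0)
    return -(p * np.log(p)).sum(axis=-1)

probs = np.array([[0.98, 0.02],    # confident negative: auto-report
                  [0.55, 0.45],    # ambiguous: refer to radiologist
                  [0.10, 0.90]])   # confident positive: auto-report
entropy = predictive_entropy(probs)
refer = entropy > 0.5              # assumed operating threshold
print(list(zip(entropy.round(3), refer)))
```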
11. arXiv:2411.11574 [pdf, ps, other]
   Categories: eess.SY (Systems and Control)
   Title: Reduced Network Cumulative Constraint Violation for Distributed Bandit Convex Optimization under Slater Condition
   Authors: Kunpeng Zhang, Xinlei Yi, Jinliang Ding, Ming Cao, Karl H. Johansson, Tao Yang
   Abstract: This paper studies the distributed bandit convex optimization problem with time-varying inequality constraints, where the goal is to minimize network regret and cumulative constraint violation. To calculate network cumulative constraint violation, existing distributed bandit online algorithms solving this problem directly use the clipped constraint function to replace the original constraint function. However, the clipping operation renders the Slater condition (i.e., there exists a point that strictly satisfies the inequality constraints at all iterations) ineffective for achieving reduced network cumulative constraint violation. To tackle this challenge, we propose a new distributed bandit online primal-dual algorithm. If local loss functions are convex, we show that the proposed algorithm establishes sublinear network regret and cumulative constraint violation bounds. When the Slater condition holds, the network cumulative constraint violation bound is reduced. In addition, if local loss functions are strongly convex, for the case where the strongly convex parameters are unknown, the network regret bound is reduced. For the case where the strongly convex parameters are known, the network regret and cumulative constraint violation bounds are further reduced. To the best of our knowledge, this paper is among the first to establish reduced (network) cumulative constraint violation bounds for (distributed) bandit convex optimization with time-varying constraints under the Slater condition. Finally, a numerical example is provided to verify the theoretical results.
   Submitted: 18 November, 2024; originally announced November 2024.
   Comments: arXiv admin note: text overlap with arXiv:2406.14060, arXiv:2306.00149.
This method randomly transforms speech to generate speed and compression labels for training. Additionally, we employ an adversarial learning technique to reduce the synthesizer-related components in the content stream. The final classification is determined by concatenating the synthesizer and content features. To enhance the model's robustness to different synthesizer characteristics, we further propose a synthesizer feature augmentation strategy that randomly blends the characteristic styles within real and fake audio features and randomly shuffles the synthesizer features with the content features. This strategy effectively enhances the feature diversity and simulates more feature combinations. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09167v1-abstract-full').style.display = 'none'; document.getElementById('2411.09167v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.03910">arXiv:2411.03910</a> <span> [<a href="https://arxiv.org/pdf/2411.03910">pdf</a>, <a href="https://arxiv.org/ps/2411.03910">ps</a>, <a href="https://arxiv.org/format/2411.03910">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> WiP: Towards a Secure SECP256K1 for Crypto Wallets: Hardware Architecture and Implementation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Lemayian%2C+J+P">Joel Poncha Lemayian</a>, <a href="/search/eess?searchtype=author&query=Gagnon%2C+G">Ghyslain Gagnon</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+K">Kaiwen Zhang</a>, <a href="/search/eess?searchtype=author&query=Giard%2C+P">Pascal Giard</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.03910v1-abstract-short" style="display: inline;"> The SECP256K1 elliptic curve algorithm is fundamental in cryptocurrency wallets for generating secure public keys from private keys, thereby ensuring the protection and ownership of blockchain-based digital assets. However, the literature highlights several successful side-channel attacks on hardware wallets that exploit SECP256K1 to extract private keys. This work proposes a novel hardware archit… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03910v1-abstract-full').style.display = 'inline'; document.getElementById('2411.03910v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.03910v1-abstract-full" style="display: none;"> The SECP256K1 elliptic curve algorithm is fundamental in cryptocurrency wallets for generating secure public keys from private keys, thereby ensuring the protection and ownership of blockchain-based digital assets. 
However, the literature highlights several successful side-channel attacks on hardware wallets that exploit SECP256K1 to extract private keys. This work proposes a novel hardware architecture for SECP256K1, optimized for side-channel attack resistance and efficient resource utilization. The architecture incorporates complete addition formulas, temporary registers, and parallel processing techniques, making elliptic curve point addition and doubling operations indistinguishable. Implementation results demonstrate an average reduction of 45% in LUT usage compared to similar works, emphasizing the design's resource efficiency. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03910v1-abstract-full').style.display = 'none'; document.getElementById('2411.03910v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Presented at HASP 2024 @ MICRO 2024 https://haspworkshop.org/2024/program.html</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.03723">arXiv:2411.03723</a> <span> [<a href="https://arxiv.org/pdf/2411.03723">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Zero-shot Dynamic MRI Reconstruction with Global-to-local Diffusion Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Guan%2C+Y">Yu Guan</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+K">Kunlong Zhang</a>, <a href="/search/eess?searchtype=author&query=Qi%2C+Q">Qi Qi</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+D">Dong Wang</a>, <a href="/search/eess?searchtype=author&query=Ke%2C+Z">Ziwen Ke</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+S">Shaoyu Wang</a>, <a href="/search/eess?searchtype=author&query=Liang%2C+D">Dong Liang</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+Q">Qiegen Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.03723v1-abstract-short" style="display: inline;"> Diffusion models have recently demonstrated considerable advancement in the generation and reconstruction of magnetic resonance imaging (MRI) data. These models exhibit great potential in handling unsampled data and reducing noise, highlighting their promise as generative models. However, their application in dynamic MRI remains relatively underexplored. 
This is primarily due to the substantial am… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03723v1-abstract-full').style.display = 'inline'; document.getElementById('2411.03723v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.03723v1-abstract-full" style="display: none;"> Diffusion models have recently demonstrated considerable advancement in the generation and reconstruction of magnetic resonance imaging (MRI) data. These models exhibit great potential in handling unsampled data and reducing noise, highlighting their promise as generative models. However, their application in dynamic MRI remains relatively underexplored. This is primarily due to the substantial amount of fully-sampled data typically required for training, which is difficult to obtain in dynamic MRI due to its spatio-temporal complexity and high acquisition costs. To address this challenge, we propose a dynamic MRI reconstruction method based on a time-interleaved acquisition scheme, termed the Global-to-local Diffusion Model. Specifically, fully encoded full-resolution reference data are constructed by merging under-sampled k-space data from adjacent time frames, generating two distinct bulk training datasets for global and local models. The global-to-local diffusion framework alternately optimizes global information and local image details, enabling zero-shot reconstruction. Extensive experiments demonstrate that the proposed method performs well in terms of noise reduction and detail preservation, achieving reconstruction quality comparable to that of supervised approaches. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03723v1-abstract-full').style.display = 'none'; document.getElementById('2411.03723v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024.
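<p class="is-size-7">A toy NumPy sketch of the time-interleaved merging step described in this abstract: adjacent under-sampled frames carry complementary Cartesian phase-encode lines, so summing a window of them yields a fully encoded (though temporally averaged) reference frame. All array sizes, the acceleration factor, and the random data are illustrative assumptions.</p>
<pre><code>import numpy as np

rng = np.random.default_rng(1)
n_frames, H, W, accel = 8, 64, 64, 4   # toy dimensions and acceleration factor

# stand-in for raw dynamic k-space data (complex-valued, one slice)
kspace = rng.standard_normal((n_frames, H, W)) + 1j * rng.standard_normal((n_frames, H, W))

# time-interleaved Cartesian masks: frame t samples every accel-th phase-encode
# line, offset by t mod accel, so consecutive frames cover complementary lines
masks = np.zeros((n_frames, H, W), dtype=bool)
for t in range(n_frames):
    masks[t, (t % accel)::accel, :] = True
undersampled = kspace * masks

# merge a window of `accel` adjacent frames so every line is covered once,
# producing a fully encoded (temporally averaged) reference per frame
reference = np.zeros((n_frames, H, W), dtype=complex)
for t in range(n_frames):
    window = [(t + k) % n_frames for k in range(accel)]
    counts = masks[window].sum(axis=0)
    reference[t] = undersampled[window].sum(axis=0) / np.maximum(counts, 1)

print("every k-space line covered:", bool(masks[[0, 1, 2, 3]].any(axis=0).all()))
</code></pre>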
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">11 pages, 9 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.16059">arXiv:2410.16059</a> <span> [<a href="https://arxiv.org/pdf/2410.16059">pdf</a>, <a href="https://arxiv.org/format/2410.16059">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Multi-Level Speaker Representation for Target Speaker Extraction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhang%2C+K">Ke Zhang</a>, <a href="/search/eess?searchtype=author&query=Li%2C+J">Junjie Li</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+S">Shuai Wang</a>, <a href="/search/eess?searchtype=author&query=Wei%2C+Y">Yangjie Wei</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+Y">Yi Wang</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+Y">Yannan Wang</a>, <a href="/search/eess?searchtype=author&query=Li%2C+H">Haizhou Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.16059v2-abstract-short" style="display: inline;"> Target speaker extraction (TSE) relies on a reference cue of the target to extract the target speech from a speech mixture. While a speaker embedding is commonly used as the reference cue, such an embedding, pre-trained on a large number of speakers, may suffer from confusion of speaker identity. In this work, we propose a multi-level speaker representation approach, from raw features to neural embed… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.16059v2-abstract-full').style.display = 'inline'; document.getElementById('2410.16059v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.16059v2-abstract-full" style="display: none;"> Target speaker extraction (TSE) relies on a reference cue of the target to extract the target speech from a speech mixture. While a speaker embedding is commonly used as the reference cue, such an embedding, pre-trained on a large number of speakers, may suffer from confusion of speaker identity. In this work, we propose a multi-level speaker representation approach, from raw features to neural embeddings, to serve as the speaker reference cue. We generate a spectral-level representation from the enrollment magnitude spectrogram as a raw, low-level feature, which significantly improves the model's generalization capability. Additionally, we propose a contextual embedding feature based on cross-attention mechanisms that integrate frame-level embeddings from a pre-trained speaker encoder. By incorporating speaker features across multiple levels, we significantly enhance the performance of the TSE model. Our approach achieves a 2.74 dB improvement and a 4.94% increase in extraction accuracy on the Libri2Mix test set over the baseline.
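<p class="is-size-7">The multi-level idea can be sketched in a few lines of PyTorch: a raw spectral cue taken from the enrollment magnitude spectrogram, plus a contextual cue obtained by letting mixture frames cross-attend to frame-level enrollment embeddings. The tensor shapes, the pooling of the spectrogram, and fusion by concatenation are assumptions for illustration, not the authors' exact design.</p>
<pre><code>import torch
import torch.nn as nn

B, T_mix, T_enr, D = 2, 200, 150, 256            # toy sizes (assumed)

# raw spectral-level cue: magnitude spectrogram of the enrollment utterance
enroll_wave = torch.randn(B, 48000)              # 3 s at 16 kHz (stand-in waveform)
mag = torch.stft(enroll_wave, n_fft=512, hop_length=160,
                 window=torch.hann_window(512), return_complex=True).abs()

# contextual cue: mixture frames cross-attend to frame-level enrollment embeddings
mix_frames = torch.randn(B, T_mix, D)            # separator hidden states (stand-in)
enr_frames = torch.randn(B, T_enr, D)            # pre-trained speaker-encoder frames (stand-in)
attn = nn.MultiheadAttention(embed_dim=D, num_heads=4, batch_first=True)
contextual, _ = attn(query=mix_frames, key=enr_frames, value=enr_frames)

# fuse the cues with the mixture representation: time-pool the spectrogram to
# one vector per utterance, broadcast it over mixture frames, and concatenate
spec_cue = mag.mean(dim=2).unsqueeze(1).expand(B, T_mix, mag.shape[1])
fused = torch.cat([mix_frames, contextual, spec_cue], dim=-1)
print(fused.shape)                               # torch.Size([2, 200, 769])
</code></pre>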
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.16059v2-abstract-full').style.display = 'none'; document.getElementById('2410.16059v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 21 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages. Submitted to ICASSP 2025. Implementation will be released at https://github.com/wenet-e2e/wesep</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.15646">arXiv:2410.15646</a> <span> [<a href="https://arxiv.org/pdf/2410.15646">pdf</a>, <a href="https://arxiv.org/format/2410.15646">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Low-Complexity Minimum BER Precoder Design for ISAC Systems: A Delay-Doppler Perspective </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Wu%2C+J">Jun Wu</a>, <a href="/search/eess?searchtype=author&query=Yuan%2C+W">Weijie Yuan</a>, <a href="/search/eess?searchtype=author&query=Wei%2C+Z">Zhiqiang Wei</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+K">Kecheng Zhang</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+F">Fan Liu</a>, <a href="/search/eess?searchtype=author&query=Ng%2C+D+W+K">Derrick Wing Kwan Ng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.15646v1-abstract-short" style="display: inline;"> Orthogonal time frequency space (OTFS) modulation is anticipated to be a promising candidate for supporting integrated sensing and communications (ISAC) systems, which is considered as a pivotal technique for realizing next generation wireless networks. In this paper, we develop a minimum bit error rate (BER) precoder design for an OTFS-based ISAC system. In particular, the BER minimization proble… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15646v1-abstract-full').style.display = 'inline'; document.getElementById('2410.15646v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.15646v1-abstract-full" style="display: none;"> Orthogonal time frequency space (OTFS) modulation is anticipated to be a promising candidate for supporting integrated sensing and communications (ISAC) systems, which is considered as a pivotal technique for realizing next generation wireless networks. In this paper, we develop a minimum bit error rate (BER) precoder design for an OTFS-based ISAC system. In particular, the BER minimization problem takes into account the maximum available transmission power budget and the required sensing performance. 
Different from prior studies that considered ISAC in the time-frequency (TF) domain, we devise the precoder from the perspective of the delay-Doppler (DD) domain by exploiting the equivalent DD domain channel due to the fact that the DD domain channel generally tends to be sparse and quasi-static, which can facilitate a low-overhead ISAC system design. To address the non-convex optimization design problem, we resort to optimizing the lower bound of the derived average BER by adopting Jensen's inequality. Subsequently, the formulated problem is decoupled into two independent sub-problems via singular value decomposition (SVD) methodology. We then theoretically analyze the feasibility conditions of the proposed problem and present a low-complexity iterative solution via leveraging the Lagrangian duality approach. Simulation results verify the effectiveness of our proposed precoder compared to the benchmark schemes and reveal the interplay between sensing and communication for dual-functional precoder design, indicating a trade-off where transmission efficiency is sacrificed for increasing transmission reliability and sensing accuracy. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15646v1-abstract-full').style.display = 'none'; document.getElementById('2410.15646v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.14214">arXiv:2410.14214</a> <span> [<a href="https://arxiv.org/pdf/2410.14214">pdf</a>, <a href="https://arxiv.org/format/2410.14214">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> MambaSCI: Efficient Mamba-UNet for Quad-Bayer Patterned Video Snapshot Compressive Imaging </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Pan%2C+Z">Zhenghao Pan</a>, <a href="/search/eess?searchtype=author&query=Zeng%2C+H">Haijin Zeng</a>, <a href="/search/eess?searchtype=author&query=Cao%2C+J">Jiezhang Cao</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+Y">Yongyong Chen</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+K">Kai Zhang</a>, <a href="/search/eess?searchtype=author&query=Xu%2C+Y">Yong Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.14214v1-abstract-short" style="display: inline;"> Color video snapshot compressive imaging (SCI) employs computational imaging techniques to capture multiple sequential video frames in a single Bayer-patterned measurement. With the increasing popularity of quad-Bayer pattern in mainstream smartphone cameras for capturing high-resolution videos, mobile photography has become more accessible to a wider audience. 
However, existing color video SCI re… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.14214v1-abstract-full').style.display = 'inline'; document.getElementById('2410.14214v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.14214v1-abstract-full" style="display: none;"> Color video snapshot compressive imaging (SCI) employs computational imaging techniques to capture multiple sequential video frames in a single Bayer-patterned measurement. With the increasing popularity of quad-Bayer pattern in mainstream smartphone cameras for capturing high-resolution videos, mobile photography has become more accessible to a wider audience. However, existing color video SCI reconstruction algorithms are designed based on the traditional Bayer pattern. When applied to videos captured by quad-Bayer cameras, these algorithms often result in color distortion and ineffective demosaicing, rendering them impractical for primary equipment. To address this challenge, we propose the MambaSCI method, which leverages the Mamba and UNet architectures for efficient reconstruction of quad-Bayer patterned color video SCI. To the best of our knowledge, our work presents the first algorithm for quad-Bayer patterned SCI reconstruction, and also the initial application of the Mamba model to this task. Specifically, we customize Residual-Mamba-Blocks, which residually connect the Spatial-Temporal Mamba (STMamba), Edge-Detail-Reconstruction (EDR) module, and Channel Attention (CA) module. Respectively, STMamba is used to model long-range spatial-temporal dependencies with linear complexity, EDR is for better edge-detail reconstruction, and CA is used to compensate for the missing channel information interaction in Mamba model. Experiments demonstrate that MambaSCI surpasses state-of-the-art methods with lower computational and memory costs. PyTorch style pseudo-code for the core modules is provided in the supplementary materials. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.14214v1-abstract-full').style.display = 'none'; document.getElementById('2410.14214v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
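<p class="is-size-7">The block structure described above can be illustrated with generic stand-ins: the sketch below wires a channel-attention module and two 3D convolutions (placeholders for the STMamba and EDR modules, which are not reproduced here) into a residual block. This is an assumed skeleton for intuition only, not the MambaSCI implementation.</p>
<pre><code>import torch
import torch.nn as nn

class ChannelAttention(nn.Module):
    # squeeze-and-excitation-style channel reweighting (generic stand-in for CA)
    def __init__(self, channels, reduction=4):
        super().__init__()
        self.pool = nn.AdaptiveAvgPool3d(1)      # (B, C, T, H, W) to (B, C, 1, 1, 1)
        self.fc = nn.Sequential(
            nn.Conv3d(channels, channels // reduction, kernel_size=1),
            nn.ReLU(inplace=True),
            nn.Conv3d(channels // reduction, channels, kernel_size=1),
            nn.Sigmoid(),
        )
    def forward(self, x):
        return x * self.fc(self.pool(x))         # reweight channels per clip

class ResidualBlock(nn.Module):
    # residual wrapper in the spirit of the Residual-Mamba-Block; plain 3D
    # convolutions stand in for the STMamba and EDR modules
    def __init__(self, channels):
        super().__init__()
        self.mixer = nn.Conv3d(channels, channels, kernel_size=3, padding=1)  # STMamba stand-in
        self.edr = nn.Conv3d(channels, channels, kernel_size=3, padding=1)    # EDR stand-in
        self.ca = ChannelAttention(channels)
    def forward(self, x):
        return x + self.ca(self.edr(torch.relu(self.mixer(x))))

x = torch.randn(1, 16, 4, 32, 32)   # (batch, channels, frames, height, width)
print(ResidualBlock(16)(x).shape)   # torch.Size([1, 16, 4, 32, 32])
</code></pre>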
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">NeurIPS 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.09706">arXiv:2410.09706</a> <span> [<a href="https://arxiv.org/pdf/2410.09706">pdf</a>, <a href="https://arxiv.org/format/2410.09706">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> ECVC: Exploiting Non-Local Correlations in Multiple Frames for Contextual Video Compression </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Jiang%2C+W">Wei Jiang</a>, <a href="/search/eess?searchtype=author&query=Li%2C+J">Junru Li</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+K">Kai Zhang</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+L">Li Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.09706v2-abstract-short" style="display: inline;"> In Learned Video Compression (LVC), improving inter prediction, such as enhancing temporal context mining and mitigating accumulated errors, is crucial for boosting rate-distortion performance. Existing LVCs mainly focus on mining the temporal movements while neglecting non-local correlations among frames. Additionally, current contextual video compression models use a single reference frame, whic… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.09706v2-abstract-full').style.display = 'inline'; document.getElementById('2410.09706v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.09706v2-abstract-full" style="display: none;"> In Learned Video Compression (LVC), improving inter prediction, such as enhancing temporal context mining and mitigating accumulated errors, is crucial for boosting rate-distortion performance. Existing LVCs mainly focus on mining the temporal movements while neglecting non-local correlations among frames. Additionally, current contextual video compression models use a single reference frame, which is insufficient for handling complex movements. To address these issues, we propose leveraging non-local correlations across multiple frames to enhance temporal priors, significantly boosting rate-distortion performance. To mitigate error accumulation, we introduce a partial cascaded fine-tuning strategy that supports fine-tuning on full-length sequences with constrained computational resources. This method reduces the train-test mismatch in sequence lengths and significantly decreases accumulated errors. Based on the proposed techniques, we present a video compression scheme ECVC. Experiments demonstrate that our ECVC achieves state-of-the-art performance, reducing $10.5\%$ and $11.5\%$ more bit-rates than previous SOTA method DCVC-FM over VTM-13.2 low delay B (LDB) under the intra period (IP) of $32$ and $-1$, respectively. Code will be available at https://github.com/JiangWeibeta/ECVC. 
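<p class="is-size-7">The partial cascaded fine-tuning idea can be sketched as follows: unroll the codec over a long sequence, but keep the autograd graph only for the last few frames, detaching earlier reconstructions so memory stays bounded while the train-test mismatch in sequence length shrinks. The toy codec, rate term, loss weight, and window size below are illustrative assumptions, not the ECVC training recipe.</p>
<pre><code>import torch

class ToyCodec(torch.nn.Module):
    # minimal conditional "codec": predicts the frame from itself and a reference,
    # with a stand-in scalar playing the role of the rate estimate
    def __init__(self):
        super().__init__()
        self.net = torch.nn.Conv2d(6, 3, kernel_size=3, padding=1)
    def forward(self, frame, ref):
        recon = self.net(torch.cat([frame, ref], dim=1))
        rate = recon.abs().mean()
        return recon, rate

def partial_cascaded_step(codec, frames, optimizer, backprop_window=2):
    # unroll over the whole sequence, but only the last `backprop_window`
    # frames keep a gradient graph; earlier reconstructions are detached
    ref, total_loss, n = frames[0], 0.0, len(frames)
    for i in range(1, n):
        if i < n - backprop_window:
            with torch.no_grad():                  # no graph for early frames
                ref, _ = codec(frames[i], ref)
        else:
            recon, rate = codec(frames[i], ref)
            total_loss = total_loss + rate + 100.0 * torch.mean((recon - frames[i]) ** 2)
            ref = recon                            # gradient flows through recent refs
    optimizer.zero_grad()
    total_loss.backward()
    optimizer.step()

frames = [torch.randn(1, 3, 64, 64) for _ in range(6)]
codec = ToyCodec()
partial_cascaded_step(codec, frames, torch.optim.Adam(codec.parameters(), lr=1e-4))
</code></pre>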
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.09706v2-abstract-full').style.display = 'none'; document.getElementById('2410.09706v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 12 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Code will be available at https://github.com/JiangWeibeta/ECVC</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.02121">arXiv:2410.02121</a> <span> [<a href="https://arxiv.org/pdf/2410.02121">pdf</a>, <a href="https://arxiv.org/format/2410.02121">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> </div> </div> <p class="title is-5 mathjax"> SC-CDM: Enhancing Quality of Image Semantic Communication with a Compact Diffusion Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhang%2C+K">Kexin Zhang</a>, <a href="/search/eess?searchtype=author&query=Li%2C+L">Lixin Li</a>, <a href="/search/eess?searchtype=author&query=Lin%2C+W">Wensheng Lin</a>, <a href="/search/eess?searchtype=author&query=Yan%2C+Y">Yuna Yan</a>, <a href="/search/eess?searchtype=author&query=Cheng%2C+W">Wenchi Cheng</a>, <a href="/search/eess?searchtype=author&query=Han%2C+Z">Zhu Han</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.02121v1-abstract-short" style="display: inline;"> Semantic Communication (SC) is an emerging technology that has attracted much attention in the sixth-generation (6G) mobile communication systems. However, few studies have fully considered the perceptual quality of the reconstructed image. To solve this problem, we propose a generative SC for wireless image transmission (denoted as SC-CDM). This approach leverages compact diffusion models to im… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.02121v1-abstract-full').style.display = 'inline'; document.getElementById('2410.02121v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.02121v1-abstract-full" style="display: none;"> Semantic Communication (SC) is an emerging technology that has attracted much attention in the sixth-generation (6G) mobile communication systems. However, few studies have fully considered the perceptual quality of the reconstructed image. To solve this problem, we propose a generative SC for wireless image transmission (denoted as SC-CDM).
This approach leverages compact diffusion models to improve the fidelity and semantic accuracy of the images reconstructed after transmission, ensuring that the essential content is preserved even in bandwidth-constrained environments. Specifically, we redesign the Swin Transformer as a new backbone for efficient semantic feature extraction and compression. Next, the receiver integrates the slim prior and image reconstruction networks. Compared to traditional Diffusion Models (DMs), it leverages DMs' robust distribution mapping capability to generate a compact condition vector, guiding image recovery, thus enhancing the perceptual details of the reconstructed images. Finally, a series of evaluation and ablation studies validates the effectiveness and robustness of the proposed algorithm, which further increases the Peak Signal-to-Noise Ratio (PSNR) by over 17% compared with CNN-based DeepJSCC. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.02121v1-abstract-full').style.display = 'none'; document.getElementById('2410.02121v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">arXiv admin note: text overlap with arXiv:2408.05112</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.01654">arXiv:2410.01654</a> <span> [<a href="https://arxiv.org/pdf/2410.01654">pdf</a>, <a href="https://arxiv.org/format/2410.01654">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> </div> <p class="title is-5 mathjax"> Releasing the Parameter Latency of Neural Representation for High-Efficiency Video Compression </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhang%2C+G">Gai Zhang</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+X">Xinfeng Zhang</a>, <a href="/search/eess?searchtype=author&query=Tang%2C+L">Lv Tang</a>, <a href="/search/eess?searchtype=author&query=Li%2C+Y">Yue Li</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+K">Kai Zhang</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+L">Li Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.01654v2-abstract-short" style="display: inline;"> For decades, video compression technology has been a prominent research area. Traditional hybrid video compression framework and end-to-end frameworks continue to explore various intra- and inter-frame reference and prediction strategies based on discrete transforms and deep learning techniques.
However, the emerging implicit neural representation (INR) technique models entire videos as basic unit… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.01654v2-abstract-full').style.display = 'inline'; document.getElementById('2410.01654v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.01654v2-abstract-full" style="display: none;"> For decades, video compression technology has been a prominent research area. Traditional hybrid video compression framework and end-to-end frameworks continue to explore various intra- and inter-frame reference and prediction strategies based on discrete transforms and deep learning techniques. However, the emerging implicit neural representation (INR) technique models entire videos as basic units, automatically capturing intra-frame and inter-frame correlations and obtaining promising performance. INR uses a compact neural network to store video information in network parameters, effectively eliminating spatial and temporal redundancy in the original video. However, in this paper, our exploration and verification reveal that current INR video compression methods do not fully exploit their potential to preserve information. We investigate the potential of enhancing network parameter storage through parameter reuse. By deepening the network, we designed a feasible INR parameter reuse scheme to further improve compression performance. Extensive experimental results show that our method significantly enhances the rate-distortion performance of INR video compression. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.01654v2-abstract-full').style.display = 'none'; document.getElementById('2410.01654v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 2 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.16431">arXiv:2409.16431</a> <span> [<a href="https://arxiv.org/pdf/2409.16431">pdf</a>, <a href="https://arxiv.org/format/2409.16431">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Hand Gesture Classification Based on Forearm Ultrasound Video Snippets Using 3D Convolutional Neural Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Bimbraw%2C+K">Keshav Bimbraw</a>, <a href="/search/eess?searchtype=author&query=Talele%2C+A">Ankit Talele</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+H+K">Haichong K. 
Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.16431v1-abstract-short" style="display: inline;"> Ultrasound-based hand movement estimation is a crucial area of research with applications in human-machine interaction. Forearm ultrasound offers detailed information about muscle morphology changes during hand movement, which can be used to estimate hand gestures. Previous work has focused on analyzing 2-Dimensional (2D) ultrasound image frames using techniques such as convolutional neural network… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.16431v1-abstract-full').style.display = 'inline'; document.getElementById('2409.16431v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.16431v1-abstract-full" style="display: none;"> Ultrasound-based hand movement estimation is a crucial area of research with applications in human-machine interaction. Forearm ultrasound offers detailed information about muscle morphology changes during hand movement, which can be used to estimate hand gestures. Previous work has focused on analyzing 2-Dimensional (2D) ultrasound image frames using techniques such as convolutional neural networks (CNNs). However, such 2D techniques do not capture temporal features from segments of ultrasound data corresponding to continuous hand movements. This study uses 3D CNN-based techniques to capture spatio-temporal patterns within ultrasound video segments for gesture recognition. We compared the performance of a 2D convolution-based network with (2+1)D convolution-based, 3D convolution-based, and our proposed network. Our methodology improved gesture classification accuracy to 98.8 +/- 0.9%, from 96.5 +/- 2.3% for a network trained with 2D convolution layers. These results demonstrate the advantages of using ultrasound video snippets for improving hand gesture classification performance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.16431v1-abstract-full').style.display = 'none'; document.getElementById('2409.16431v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024.
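<p class="is-size-7">A compact example of the 3D-convolutional approach this abstract describes: treating an ultrasound snippet as a (frames, height, width) volume and classifying it with Conv3d layers, so temporal context is learned jointly with spatial features. The layer sizes and class count below are illustrative, not the authors' network.</p>
<pre><code>import torch
import torch.nn as nn

class Gesture3DCNN(nn.Module):
    # small illustrative 3D-convolutional classifier for video snippets
    def __init__(self, n_classes=10):
        super().__init__()
        self.features = nn.Sequential(
            nn.Conv3d(1, 16, kernel_size=3, padding=1), nn.ReLU(),
            nn.MaxPool3d(2),                     # downsample time and space together
            nn.Conv3d(16, 32, kernel_size=3, padding=1), nn.ReLU(),
            nn.MaxPool3d(2),
            nn.AdaptiveAvgPool3d(1),             # global spatio-temporal pooling
        )
        self.head = nn.Linear(32, n_classes)
    def forward(self, x):                        # x: (batch, 1, frames, height, width)
        return self.head(self.features(x).flatten(1))

clip = torch.randn(2, 1, 16, 64, 64)             # two 16-frame grayscale snippets (toy size)
print(Gesture3DCNN()(clip).shape)                # torch.Size([2, 10])
</code></pre>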
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to IUS 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.15799">arXiv:2409.15799</a> <span> [<a href="https://arxiv.org/pdf/2409.15799">pdf</a>, <a href="https://arxiv.org/format/2409.15799">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> WeSep: A Scalable and Flexible Toolkit Towards Generalizable Target Speaker Extraction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Wang%2C+S">Shuai Wang</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+K">Ke Zhang</a>, <a href="/search/eess?searchtype=author&query=Lin%2C+S">Shaoxiong Lin</a>, <a href="/search/eess?searchtype=author&query=Li%2C+J">Junjie Li</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+X">Xuefei Wang</a>, <a href="/search/eess?searchtype=author&query=Ge%2C+M">Meng Ge</a>, <a href="/search/eess?searchtype=author&query=Yu%2C+J">Jianwei Yu</a>, <a href="/search/eess?searchtype=author&query=Qian%2C+Y">Yanmin Qian</a>, <a href="/search/eess?searchtype=author&query=Li%2C+H">Haizhou Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.15799v1-abstract-short" style="display: inline;"> Target speaker extraction (TSE) focuses on isolating the speech of a specific target speaker from overlapped multi-talker speech, which is a typical setup in the cocktail party problem. In recent years, TSE has drawn increasing attention due to its potential for various applications such as user-customized interfaces and hearing aids, or as a crucial front-end processing technology for subsequent… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.15799v1-abstract-full').style.display = 'inline'; document.getElementById('2409.15799v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.15799v1-abstract-full" style="display: none;"> Target speaker extraction (TSE) focuses on isolating the speech of a specific target speaker from overlapped multi-talker speech, which is a typical setup in the cocktail party problem. In recent years, TSE has drawn increasing attention due to its potential for various applications such as user-customized interfaces and hearing aids, or as a crucial front-end processing technology for subsequent tasks such as speech recognition and speaker recognition. However, there are currently few open-source toolkits or available pre-trained models for off-the-shelf usage. In this work, we introduce WeSep, a toolkit designed for research and practical applications in TSE. WeSep features flexible target speaker modeling, scalable data management, effective on-the-fly data simulation, structured recipes and deployment support.
The toolkit is publicly available at \url{https://github.com/wenet-e2e/WeSep}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.15799v1-abstract-full').style.display = 'none'; document.getElementById('2409.15799v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Interspeech 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.11299">arXiv:2409.11299</a> <span> [<a href="https://arxiv.org/pdf/2409.11299">pdf</a>, <a href="https://arxiv.org/format/2409.11299">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> TTT-Unet: Enhancing U-Net with Test-Time Training Layers for Biomedical Image Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhou%2C+R">Rong Zhou</a>, <a href="/search/eess?searchtype=author&query=Yuan%2C+Z">Zhengqing Yuan</a>, <a href="/search/eess?searchtype=author&query=Yan%2C+Z">Zhiling Yan</a>, <a href="/search/eess?searchtype=author&query=Sun%2C+W">Weixiang Sun</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+K">Kai Zhang</a>, <a href="/search/eess?searchtype=author&query=Li%2C+Y">Yiwei Li</a>, <a href="/search/eess?searchtype=author&query=Ye%2C+Y">Yanfang Ye</a>, <a href="/search/eess?searchtype=author&query=Li%2C+X">Xiang Li</a>, <a href="/search/eess?searchtype=author&query=He%2C+L">Lifang He</a>, <a href="/search/eess?searchtype=author&query=Sun%2C+L">Lichao Sun</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.11299v3-abstract-short" style="display: inline;"> Biomedical image segmentation is crucial for accurately diagnosing and analyzing various diseases. However, Convolutional Neural Networks (CNNs) and Transformers, the most commonly used architectures for this task, struggle to effectively capture long-range dependencies due to the inherent locality of CNNs and the computational complexity of Transformers. To address this limitation, we introduce T… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.11299v3-abstract-full').style.display = 'inline'; document.getElementById('2409.11299v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.11299v3-abstract-full" style="display: none;"> Biomedical image segmentation is crucial for accurately diagnosing and analyzing various diseases.
However, Convolutional Neural Networks (CNNs) and Transformers, the most commonly used architectures for this task, struggle to effectively capture long-range dependencies due to the inherent locality of CNNs and the computational complexity of Transformers. To address this limitation, we introduce TTT-Unet, a novel framework that integrates Test-Time Training (TTT) layers into the traditional U-Net architecture for biomedical image segmentation. TTT-Unet dynamically adjusts model parameters during the testing time, enhancing the model's ability to capture both local and long-range features. We evaluate TTT-Unet on multiple medical imaging datasets, including 3D abdominal organ segmentation in CT and MR images, instrument segmentation in endoscopy images, and cell segmentation in microscopy images. The results demonstrate that TTT-Unet consistently outperforms state-of-the-art CNN-based and Transformer-based segmentation models across all tasks. The code is available at https://github.com/rongzhou7/TTT-Unet. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.11299v3-abstract-full').style.display = 'none'; document.getElementById('2409.11299v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 17 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.09589">arXiv:2409.09589</a> <span> [<a href="https://arxiv.org/pdf/2409.09589">pdf</a>, <a href="https://arxiv.org/format/2409.09589">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> On the effectiveness of enrollment speech augmentation for Target Speaker Extraction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Li%2C+J">Junjie Li</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+K">Ke Zhang</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+S">Shuai Wang</a>, <a href="/search/eess?searchtype=author&query=Li%2C+H">Haizhou Li</a>, <a href="/search/eess?searchtype=author&query=Mak%2C+M">Man-Wai Mak</a>, <a href="/search/eess?searchtype=author&query=Lee%2C+K+A">Kong Aik Lee</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.09589v1-abstract-short" style="display: inline;"> Deep learning technologies have significantly advanced the performance of target speaker extraction (TSE) tasks. To enhance the generalization and robustness of these algorithms when training data is insufficient, data augmentation is a commonly adopted technique. 
Unlike typical data augmentation applied to speech mixtures, this work thoroughly investigates the effectiveness of augmenting the enro… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.09589v1-abstract-full').style.display = 'inline'; document.getElementById('2409.09589v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.09589v1-abstract-full" style="display: none;"> Deep learning technologies have significantly advanced the performance of target speaker extraction (TSE) tasks. To enhance the generalization and robustness of these algorithms when training data is insufficient, data augmentation is a commonly adopted technique. Unlike typical data augmentation applied to speech mixtures, this work thoroughly investigates the effectiveness of augmenting the enrollment speech space. We found that for both pretrained and jointly optimized speaker encoders, directly augmenting the enrollment speech leads to consistent performance improvement. In addition to conventional methods such as noise and reverberation addition, we propose a novel augmentation method called self-estimated speech augmentation (SSA). Experimental results on the Libri2Mix test set show that our proposed method can achieve an improvement of up to 2.5 dB. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.09589v1-abstract-full').style.display = 'none'; document.getElementById('2409.09589v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. 
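<p class="is-size-7">Of the augmentations discussed, the conventional noise-addition variant is easy to sketch: perturb the enrollment utterance at a randomly drawn SNR before computing the speaker cue. The SNR range and white-noise choice below are illustrative assumptions; reverberation and the proposed SSA method are not reproduced here.</p>
<pre><code>import numpy as np

def augment_enrollment(wave, rng, snr_db_range=(5.0, 20.0)):
    # add white noise at a random SNR drawn from `snr_db_range` (assumed range);
    # a conventional enrollment augmentation, not the paper's proposed SSA
    snr_db = rng.uniform(*snr_db_range)
    signal_power = np.mean(wave ** 2)
    noise_power = signal_power / (10.0 ** (snr_db / 10.0))
    noise = rng.standard_normal(wave.shape) * np.sqrt(noise_power)
    return wave + noise

rng = np.random.default_rng(0)
enroll = rng.standard_normal(48000)     # 3 s toy enrollment waveform at 16 kHz
augmented = augment_enrollment(enroll, rng)
</code></pre>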
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by SLT2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.03844">arXiv:2409.03844</a> <span> [<a href="https://arxiv.org/pdf/2409.03844">pdf</a>, <a href="https://arxiv.org/format/2409.03844">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> MetaBGM: Dynamic Soundtrack Transformation For Continuous Multi-Scene Experiences With Ambient Awareness And Personalization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Liu%2C+H">Haoxuan Liu</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+Z">Zihao Wang</a>, <a href="/search/eess?searchtype=author&query=Hong%2C+H">Haorong Hong</a>, <a href="/search/eess?searchtype=author&query=Feng%2C+Y">Youwei Feng</a>, <a href="/search/eess?searchtype=author&query=Yu%2C+J">Jiaxin Yu</a>, <a href="/search/eess?searchtype=author&query=Diao%2C+H">Han Diao</a>, <a href="/search/eess?searchtype=author&query=Xu%2C+Y">Yunfei Xu</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+K">Kejun Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.03844v1-abstract-short" style="display: inline;"> This paper introduces MetaBGM, a groundbreaking framework for generating background music that adapts to dynamic scenes and real-time user interactions. We define multi-scene as variations in environmental contexts, such as transitions in game settings or movie scenes. To tackle the challenge of converting backend data into music description texts for audio generation models, MetaBGM employs a nov… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.03844v1-abstract-full').style.display = 'inline'; document.getElementById('2409.03844v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.03844v1-abstract-full" style="display: none;"> This paper introduces MetaBGM, a groundbreaking framework for generating background music that adapts to dynamic scenes and real-time user interactions. We define multi-scene as variations in environmental contexts, such as transitions in game settings or movie scenes. To tackle the challenge of converting backend data into music description texts for audio generation models, MetaBGM employs a novel two-stage generation approach that transforms continuous scene and user state data into these texts, which are then fed into an audio generation model for real-time soundtrack creation. 
Experimental results demonstrate that MetaBGM effectively generates contextually relevant and dynamic background music for interactive applications. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.03844v1-abstract-full').style.display = 'none'; document.getElementById('2409.03844v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.17186">arXiv:2408.17186</a> <span> [<a href="https://arxiv.org/pdf/2408.17186">pdf</a>, <a href="https://arxiv.org/format/2408.17186">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> "Benefit Game: Alien Seaweed Swarms" -- Real-time Gamification of Digital Seaweed Ecology </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Fei%2C+D">Dan-Lu Fei</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+Z">Zi-Wei Wu</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+K">Kang Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.17186v1-abstract-short" style="display: inline;"> "Benefit Game: Alien Seaweed Swarms" combines artificial life art and interactive game with installation to explore the impact of human activity on fragile seaweed ecosystems. The project aims to promote ecological consciousness by creating a balance in digital seaweed ecologies. Inspired by the real species "Laminaria saccharina", the author employs Procedural Content Generation via Machine Learn… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.17186v1-abstract-full').style.display = 'inline'; document.getElementById('2408.17186v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.17186v1-abstract-full" style="display: none;"> "Benefit Game: Alien Seaweed Swarms" combines artificial life art and interactive game with installation to explore the impact of human activity on fragile seaweed ecosystems. The project aims to promote ecological consciousness by creating a balance in digital seaweed ecologies. Inspired by the real species "Laminaria saccharina", the author employs Procedural Content Generation via Machine Learning technology to generate variations of virtual seaweeds and symbiotic fungi. The audience can explore the consequences of human activities through gameplay and observe the ecosystem's feedback on the benefits and risks of seaweed aquaculture. This Benefit Game offers dynamic and real-time responsive artificial seaweed ecosystems for an interactive experience that enhances ecological consciousness. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.17186v1-abstract-full').style.display = 'none'; document.getElementById('2408.17186v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Paper accepted at ISEA 24, The 29th International Symposium on Electronic Art, Brisbane, Australia, 21-29 June 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.16338">arXiv:2408.16338</a> <span> [<a href="https://arxiv.org/pdf/2408.16338">pdf</a>, <a href="https://arxiv.org/ps/2408.16338">ps</a>, <a href="https://arxiv.org/format/2408.16338">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Deep DeePC: Data-enabled predictive control with low or no online optimization using deep learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhang%2C+X">Xuewen Zhang</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+K">Kaixiang Zhang</a>, <a href="/search/eess?searchtype=author&query=Li%2C+Z">Zhaojian Li</a>, <a href="/search/eess?searchtype=author&query=Yin%2C+X">Xunyuan Yin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.16338v3-abstract-short" style="display: inline;"> Data-enabled predictive control (DeePC) is a data-driven control algorithm that utilizes data matrices to form a non-parametric representation of the underlying system, predicting future behaviors and generating optimal control actions. DeePC typically requires solving an online optimization problem, the complexity of which is heavily influenced by the amount of data used, potentially leading to e… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.16338v3-abstract-full').style.display = 'inline'; document.getElementById('2408.16338v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.16338v3-abstract-full" style="display: none;"> Data-enabled predictive control (DeePC) is a data-driven control algorithm that utilizes data matrices to form a non-parametric representation of the underlying system, predicting future behaviors and generating optimal control actions. DeePC typically requires solving an online optimization problem, the complexity of which is heavily influenced by the amount of data used, potentially leading to expensive online computation. In this paper, we leverage deep learning to propose a highly computationally efficient DeePC approach for general nonlinear processes, referred to as Deep DeePC. Specifically, a deep neural network is employed to learn the DeePC vector operator, which is an essential component of the non-parametric representation of DeePC. 
This neural network is trained offline using historical open-loop input and output data of the nonlinear process. With the trained neural network, the Deep DeePC framework is formed for online control implementation. At each sampling instant, this neural network directly outputs the DeePC operator, eliminating the need for the online optimization required by conventional DeePC. The optimal control action is obtained based on the DeePC operator updated by the trained neural network. To address constrained scenarios, a constraint handling scheme is further proposed and integrated with the Deep DeePC to handle hard constraints during online implementation. The efficacy and superiority of the proposed Deep DeePC approach are demonstrated using two benchmark process examples. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.16338v3-abstract-full').style.display = 'none'; document.getElementById('2408.16338v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 29 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">34 pages, 7 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.14170">arXiv:2408.14170</a> <span> [<a href="https://arxiv.org/pdf/2408.14170">pdf</a>, <a href="https://arxiv.org/format/2408.14170">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Image Provenance Analysis via Graph Encoding with Vision Transformer </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhang%2C+K">Keyang Zhang</a>, <a href="/search/eess?searchtype=author&query=Kong%2C+C">Chenqi Kong</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+S">Shiqi Wang</a>, <a href="/search/eess?searchtype=author&query=Rocha%2C+A">Anderson Rocha</a>, <a href="/search/eess?searchtype=author&query=Li%2C+H">Haoliang Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.14170v1-abstract-short" style="display: inline;"> Recent advances in AI-powered image editing tools have significantly lowered the barrier to image modification, raising pressing security concerns, particularly those related to spreading misinformation and disinformation on social platforms. 
Image provenance analysis is crucial in this context, as it identifies relevant images within a database and constructs a relationship graph by mining hidden manipulation… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.14170v1-abstract-full').style.display = 'inline'; document.getElementById('2408.14170v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.14170v1-abstract-full" style="display: none;"> Recent advances in AI-powered image editing tools have significantly lowered the barrier to image modification, raising pressing security concerns, particularly those related to spreading misinformation and disinformation on social platforms. Image provenance analysis is crucial in this context, as it identifies relevant images within a database and constructs a relationship graph by mining hidden manipulation and transformation cues, thereby providing concrete evidence chains. This paper introduces a novel end-to-end deep learning framework designed to explore the structural information of provenance graphs. Our proposed method differs from previous approaches in two main ways. First, unlike earlier methods that rely on prior knowledge and have limited generalizability, our framework relies upon a patch attention mechanism to capture image provenance clues for local manipulations and global transformations, thereby enhancing graph construction performance. Second, while previous methods primarily focus on identifying tampering traces only between image pairs, they often overlook the hidden information embedded in the topology of the provenance graph. Our approach aligns the model training objectives with the final graph construction task, incorporating the overall structural information of the graph into the training process. We integrate graph structure information with the attention mechanism, enabling precise determination of the direction of transformation. Experimental results show the superiority of the proposed method over previous approaches, underscoring its effectiveness in addressing the challenges of image provenance analysis. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.14170v1-abstract-full').style.display = 'none'; document.getElementById('2408.14170v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">13 pages, 10 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.09650">arXiv:2408.09650</a> <span> [<a href="https://arxiv.org/pdf/2408.09650">pdf</a>, <a href="https://arxiv.org/format/2408.09650">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> ExpoMamba: Exploiting Frequency SSM Blocks for Efficient and Effective Image Enhancement </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Adhikarla%2C+E">Eashan Adhikarla</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+K">Kai Zhang</a>, <a href="/search/eess?searchtype=author&query=Nicholson%2C+J">John Nicholson</a>, <a href="/search/eess?searchtype=author&query=Davison%2C+B+D">Brian D. Davison</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.09650v1-abstract-short" style="display: inline;"> Low-light image enhancement remains a challenging task in computer vision, with existing state-of-the-art models often limited by hardware constraints and computational inefficiencies, particularly in handling high-resolution images. Recent foundation models, such as transformers and diffusion models, despite their efficacy in various domains, are limited in use on edge devices due to their comput… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.09650v1-abstract-full').style.display = 'inline'; document.getElementById('2408.09650v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.09650v1-abstract-full" style="display: none;"> Low-light image enhancement remains a challenging task in computer vision, with existing state-of-the-art models often limited by hardware constraints and computational inefficiencies, particularly in handling high-resolution images. Recent foundation models, such as transformers and diffusion models, despite their efficacy in various domains, are limited in use on edge devices due to their computational complexity and slow inference times. We introduce ExpoMamba, a novel architecture that integrates components of the frequency state space within a modified U-Net, offering a blend of efficiency and effectiveness. This model is specifically optimized to address mixed exposure challenges, a common issue in low-light image enhancement, while ensuring computational efficiency. Our experiments demonstrate that ExpoMamba enhances low-light images up to 2-3x faster than traditional models with an inference time of 36.6 ms and achieves a PSNR improvement of approximately 15-20% over competing models, making it highly suitable for real-time image processing applications. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.09650v1-abstract-full').style.display = 'none'; document.getElementById('2408.09650v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Efficient Systems for Foundation Models II, International Conference on Machine Learning (ICML) 2024 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.08057">arXiv:2408.08057</a> <span> [<a href="https://arxiv.org/pdf/2408.08057">pdf</a>, <a href="https://arxiv.org/format/2408.08057">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Optimal Joint Fronthaul Compression and Beamforming Design for Networked ISAC Systems </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhang%2C+K">Kexin Zhang</a>, <a href="/search/eess?searchtype=author&query=Xu%2C+Y">Yanqing Xu</a>, <a href="/search/eess?searchtype=author&query=He%2C+R">Ruisi He</a>, <a href="/search/eess?searchtype=author&query=Shen%2C+C">Chao Shen</a>, <a href="/search/eess?searchtype=author&query=Chang%2C+T">Tsung-hui Chang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.08057v1-abstract-short" style="display: inline;"> This study investigates a networked integrated sensing and communication (ISAC) system, where multiple base stations (BSs), connected to a central processor (CP) via capacity-limited fronthaul links, cooperatively serve communication users while simultaneously sensing a target. The primary objective is to minimize the total transmit power while meeting the signal-to-interference-plus-noise ratio (… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.08057v1-abstract-full').style.display = 'inline'; document.getElementById('2408.08057v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.08057v1-abstract-full" style="display: none;"> This study investigates a networked integrated sensing and communication (ISAC) system, where multiple base stations (BSs), connected to a central processor (CP) via capacity-limited fronthaul links, cooperatively serve communication users while simultaneously sensing a target. The primary objective is to minimize the total transmit power while meeting the signal-to-interference-plus-noise ratio (SINR) requirements for communication and sensing under fronthaul capacity constraints, resulting in a joint fronthaul compression and beamforming design (J-FCBD) problem. We demonstrate that the optimal fronthaul compression variables can be determined in closed form alongside the beamformers, a novel finding in this field. 
Leveraging this insight, we show that the remaining beamforming design problem can be solved globally using the semidefinite relaxation (SDR) technique, albeit with considerable complexity. Furthermore, the tightness of its SDR reveals a zero duality gap between the considered problem and its Lagrangian dual. Building on this duality result, we exploit the novel UL-DL duality within the ISAC framework to develop an efficient primal-dual (PD)-based algorithm. The algorithm alternates between solving the beamforming problem with a fixed dual variable via fixed-point iteration and updating the dual variable via bisection, ensuring global optimality and achieving high efficiency due to the computationally inexpensive iterations. Numerical results confirm the global optimality, effectiveness, and efficiency of the proposed PD-based algorithm. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.08057v1-abstract-full').style.display = 'none'; document.getElementById('2408.08057v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.05529">arXiv:2408.05529</a> <span> [<a href="https://arxiv.org/pdf/2408.05529">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Quasi-Fractal UCA Based OAM for Highly Efficient Orthogonal Transmission </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Cheng%2C+W">Wenchi Cheng</a>, <a href="/search/eess?searchtype=author&query=Jing%2C+H">Haiyue Jing</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+W">Wei Zhang</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+K">Keyi Zhang</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+H">Hailin Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.05529v1-abstract-short" style="display: inline;"> The development of orbital angular momentum (OAM)-based radio vortex transmission presents a promising opportunity for increasing the capacity of wireless communication in correlated channels due to its inherent orthogonality among different OAM modes. One of the most popular schemes for highly efficient OAM transmission is the digital baseband associated with uniform circular array (UCA) based tran… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.05529v1-abstract-full').style.display = 'inline'; document.getElementById('2408.05529v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.05529v1-abstract-full" style="display: none;"> The development of orbital angular momentum (OAM)-based radio vortex transmission presents a promising opportunity for increasing the capacity of wireless communication in correlated channels due to its inherent orthogonality among different OAM modes. 
One of the most popular schemes for highly efficient OAM transmission is the digital baseband associated with a uniform circular array (UCA) based transceiver. However, the periodicity of the complex-exponential feed generally restricts the maximum number of orthogonal signals carried by multiple OAM modes to the array-element number of the UCA antenna, which poses an open question of how to employ more OAM modes given a fixed number of array elements. Furthermore, signals modulated with high-order OAM modes are difficult for the receiver to capture due to their severe divergence as they propagate in free space, thus severely limiting the capacity of radio vortex communications. To overcome the above challenges, in this paper we propose quasi-fractal UCA (QF-UCA) antenna based OAM multiplexing transmission, built on a partly element-overlapped fractal geometry layout that effectively uses low-order OAM modes. We perform two-dimensional OAM modulation (TOM) and demodulation (TOD) schemes with the orthogonal OAM mode number exceeding the array-element number, which is beyond the traditional concept of multiple-antenna based wireless communications. Simulation results show that our proposed scheme can achieve more orthogonal multiplexing streams than the maximum supported by traditional multiple-antenna systems. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.05529v1-abstract-full').style.display = 'none'; document.getElementById('2408.05529v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.05112">arXiv:2408.05112</a> <span> [<a href="https://arxiv.org/pdf/2408.05112">pdf</a>, <a href="https://arxiv.org/format/2408.05112">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Semantic Successive Refinement: A Generative AI-aided Semantic Communication Framework </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhang%2C+K">Kexin Zhang</a>, <a href="/search/eess?searchtype=author&query=Li%2C+L">Lixin Li</a>, <a href="/search/eess?searchtype=author&query=Lin%2C+W">Wensheng Lin</a>, <a href="/search/eess?searchtype=author&query=Yan%2C+Y">Yuna Yan</a>, <a href="/search/eess?searchtype=author&query=Li%2C+R">Rui Li</a>, <a href="/search/eess?searchtype=author&query=Cheng%2C+W">Wenchi Cheng</a>, <a href="/search/eess?searchtype=author&query=Han%2C+Z">Zhu Han</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.05112v1-abstract-short" style="display: inline;"> Semantic Communication (SC) is an emerging technology aiming to surpass the Shannon limit. 
Traditional SC strategies often minimize signal distortion between the original and reconstructed data, neglecting perceptual quality, especially in low Signal-to-Noise Ratio (SNR) environments. To address this issue, we introduce a novel Generative AI Semantic Communication (GSC) system for single-user scen… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.05112v1-abstract-full').style.display = 'inline'; document.getElementById('2408.05112v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.05112v1-abstract-full" style="display: none;"> Semantic Communication (SC) is an emerging technology aiming to surpass the Shannon limit. Traditional SC strategies often minimize signal distortion between the original and reconstructed data, neglecting perceptual quality, especially in low Signal-to-Noise Ratio (SNR) environments. To address this issue, we introduce a novel Generative AI Semantic Communication (GSC) system for single-user scenarios. This system leverages deep generative models to establish a new paradigm in SC. Specifically, at the transmitter end, it employs a joint source-channel coding mechanism based on the Swin Transformer for efficient semantic feature extraction and compression. At the receiver end, an advanced Diffusion Model (DM) reconstructs high-quality images from degraded signals, enhancing perceptual details. Additionally, we present a Multi-User Generative Semantic Communication (MU-GSC) system utilizing an asynchronous processing model. This model effectively manages multiple user requests and optimally utilizes system resources for parallel processing. Simulation results on public datasets demonstrate that our generative AI semantic communication systems achieve superior transmission efficiency and enhanced communication content quality across various channel conditions. Compared to CNN-based DeepJSCC, our methods improve the Peak Signal-to-Noise Ratio (PSNR) by 17.75% in Additive White Gaussian Noise (AWGN) channels and by 20.86% in Rayleigh channels. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.05112v1-abstract-full').style.display = 'none'; document.getElementById('2408.05112v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. 
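<p class="is-size-7 has-text-grey-dark">The GSC system above sends learned semantic features through a noisy channel at a given SNR. A minimal sketch of the AWGN stage, assuming a power-normalized latent vector as is standard in JSCC-style systems (the setup is illustrative, not taken from the paper):</p> <pre><code class="language-python">
import numpy as np

def awgn(latent, snr_db, rng):
    """Transmit a feature vector over an AWGN channel at the given SNR (dB)."""
    z = latent / np.sqrt(np.mean(latent ** 2))      # unit average signal power
    noise_std = np.sqrt(10.0 ** (-snr_db / 10.0))   # noise power = 1 / SNR
    return z + rng.normal(0.0, noise_std, size=z.shape)

rng = np.random.default_rng(0)
z_tx = rng.normal(size=512)             # stand-in for encoder output features
z_rx = awgn(z_tx, snr_db=5.0, rng=rng)  # what the receiver-side decoder sees
</code></pre>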
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.21323">arXiv:2407.21323</a> <span> [<a href="https://arxiv.org/pdf/2407.21323">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.3390/tomography10120138">10.3390/tomography10120138 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> STANet: A Novel Spatio-Temporal Aggregation Network for Depression Classification with Small and Unbalanced FMRI Data </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhang%2C+W">Wei Zhang</a>, <a href="/search/eess?searchtype=author&query=Zeng%2C+W">Weiming Zeng</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+H">Hongyu Chen</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+J">Jie Liu</a>, <a href="/search/eess?searchtype=author&query=Yan%2C+H">Hongjie Yan</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+K">Kaile Zhang</a>, <a href="/search/eess?searchtype=author&query=Tao%2C+R">Ran Tao</a>, <a href="/search/eess?searchtype=author&query=Siok%2C+W+T">Wai Ting Siok</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+N">Nizhuan Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.21323v2-abstract-short" style="display: inline;"> Accurate diagnosis of depression is crucial for timely implementation of optimal treatments, preventing complications and reducing the risk of suicide. Traditional methods rely on self-report questionnaires and clinical assessment, lacking objective biomarkers. Combining fMRI with artificial intelligence can enhance depression diagnosis by integrating neuroimaging indicators. However, the specific… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.21323v2-abstract-full').style.display = 'inline'; document.getElementById('2407.21323v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.21323v2-abstract-full" style="display: none;"> Accurate diagnosis of depression is crucial for timely implementation of optimal treatments, preventing complications and reducing the risk of suicide. Traditional methods rely on self-report questionnaires and clinical assessment, lacking objective biomarkers. Combining fMRI with artificial intelligence can enhance depression diagnosis by integrating neuroimaging indicators. However, the specificity of fMRI acquisition for depression often results in unbalanced and small datasets, challenging the sensitivity and accuracy of classification models. In this study, we propose the Spatio-Temporal Aggregation Network (STANet) for diagnosing depression by integrating CNN and RNN to capture both temporal and spatial features of brain activity. 
STANet comprises the following steps: (1) Aggregate spatio-temporal information via ICA. (2) Utilize multi-scale deep convolution to capture detailed features. (3) Balance data using SMOTE to generate new samples for minority classes. (4) Employ the AFGRU classifier, which combines Fourier transformation with GRU, to capture long-term dependencies, with an adaptive weight assignment mechanism to enhance model generalization. The experimental results demonstrate that STANet achieves superior depression diagnostic performance with 82.38% accuracy and a 90.72% AUC. The STFA module enhances classification by capturing deeper features at multiple scales. The AFGRU classifier, with adaptive weights and stacked GRU, attains higher accuracy and AUC. SMOTE outperforms other oversampling methods. Additionally, spatio-temporal aggregated features achieve better performance compared to using only temporal or spatial features. STANet outperforms traditional and deep learning classifiers, as well as functional connectivity-based classifiers, as demonstrated by ten-fold cross-validation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.21323v2-abstract-full').style.display = 'none'; document.getElementById('2407.21323v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 31 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This paper is published in Tomography</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Tomography, 2024 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.16066">arXiv:2407.16066</a> <span> [<a href="https://arxiv.org/pdf/2407.16066">pdf</a>, <a href="https://arxiv.org/ps/2407.16066">ps</a>, <a href="https://arxiv.org/format/2407.16066">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Online Reduced-Order Data-Enabled Predictive Control </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Vahidi-Moghaddam%2C+A">Amin Vahidi-Moghaddam</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+K">Kaixiang Zhang</a>, <a href="/search/eess?searchtype=author&query=Yin%2C+X">Xunyuan Yin</a>, <a href="/search/eess?searchtype=author&query=Srivastava%2C+V">Vaibhav Srivastava</a>, <a href="/search/eess?searchtype=author&query=Li%2C+Z">Zhaojian Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.16066v1-abstract-short" style="display: inline;"> Data-enabled predictive control (DeePC) has garnered significant attention for its ability to achieve safe, data-driven optimal control without relying on explicit system models. Traditional DeePC methods use pre-collected input/output (I/O) data to construct Hankel matrices for online predictive control. 
However, in systems with evolving dynamics or insufficient pre-collected data, incorporating… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.16066v1-abstract-full').style.display = 'inline'; document.getElementById('2407.16066v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.16066v1-abstract-full" style="display: none;"> Data-enabled predictive control (DeePC) has garnered significant attention for its ability to achieve safe, data-driven optimal control without relying on explicit system models. Traditional DeePC methods use pre-collected input/output (I/O) data to construct Hankel matrices for online predictive control. However, in systems with evolving dynamics or insufficient pre-collected data, incorporating real-time data into the DeePC framework becomes crucial to enhance control performance. This paper proposes an online DeePC framework for time-varying systems (i.e., systems with evolving dynamics), enabling the algorithm to update the Hankel matrix online by adding real-time informative signals. By exploiting the minimum non-zero singular value of the Hankel matrix, the developed online DeePC selectively integrates informative data and effectively captures evolving system dynamics. Additionally, a numerical singular value decomposition technique is introduced to reduce the computational complexity for updating a reduced-order Hankel matrix. Simulation results on two cases, a linear time-varying system and the vehicle anti-rollover control, demonstrate the effectiveness of the proposed online reduced-order DeePC framework. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.16066v1-abstract-full').style.display = 'none'; document.getElementById('2407.16066v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. 
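<p class="is-size-7 has-text-grey-dark">Both DeePC entries above are built around a block-Hankel matrix of I/O data; the online variant slides in fresh samples and keeps a reduced-order surrogate. A conceptual sketch of those two steps (the plain singular-value threshold below is an illustrative stand-in for the paper's minimum-singular-value criterion):</p> <pre><code class="language-python">
import numpy as np

def block_hankel(w, depth):
    """Block-Hankel matrix of a (T, m) signal with `depth` stacked samples."""
    T, m = w.shape
    cols = T - depth + 1
    H = np.empty((depth * m, cols))
    for j in range(cols):
        H[:, j] = w[j:j + depth].reshape(-1)
    return H

def reduce_order(H, tol=1e-8):
    """Truncated-SVD surrogate of H spanning the same dominant column space."""
    U, s, _ = np.linalg.svd(H, full_matrices=False)
    r = int(np.sum(s > tol * s[0]))   # keep only significant directions
    return U[:, :r] * s[:r]

rng = np.random.default_rng(0)
w = rng.normal(size=(60, 2))                      # stacked I/O samples
H = block_hankel(w, depth=10)
new_col = block_hankel(w[-10:], depth=10)         # column from newest samples
H = np.hstack([H[:, 1:], new_col])                # slide the data window
print(reduce_order(H).shape)
</code></pre>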
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.07728">arXiv:2407.07728</a> <span> </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> SaMoye: Zero-shot Singing Voice Conversion Model Based on Feature Disentanglement and Enhancement </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Wang%2C+Z">Zihao Wang</a>, <a href="/search/eess?searchtype=author&query=Ma%2C+L">Le Ma</a>, <a href="/search/eess?searchtype=author&query=Feng%2C+Y">Yongsheng Feng</a>, <a href="/search/eess?searchtype=author&query=Pan%2C+X">Xin Pan</a>, <a href="/search/eess?searchtype=author&query=Jin%2C+Y">Yuhang Jin</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+K">Kejun Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.07728v5-abstract-short" style="display: inline;"> Singing voice conversion (SVC) aims to convert a singer's voice to another singer's from a reference audio while keeping the original semantics. However, existing SVC methods can hardly perform zero-shot due to incomplete feature disentanglement or dependence on the speaker look-up table. We propose the first open-source high-quality zero-shot SVC model SaMoye that can convert singing to human and… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.07728v5-abstract-full').style.display = 'inline'; document.getElementById('2407.07728v5-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.07728v5-abstract-full" style="display: none;"> Singing voice conversion (SVC) aims to convert a singer's voice to another singer's from a reference audio while keeping the original semantics. However, existing SVC methods can hardly perform zero-shot due to incomplete feature disentanglement or dependence on the speaker look-up table. We propose the first open-source high-quality zero-shot SVC model SaMoye that can convert singing to human and non-human timbre. SaMoye disentangles the singing voice's features into content, timbre, and pitch features, where we combine multiple ASR models and compress the content features to reduce timbre leaks. Besides, we enhance the timbre features by unfreezing the speaker encoder and mixing the speaker embedding with top-3 similar speakers. We also establish an unparalleled large-scale dataset to guarantee zero-shot performance, which comprises more than 1,815 hours of pure singing voice and 6,367 speakers. We conduct objective and subjective experiments to find that SaMoye outperforms other models in zero-shot SVC tasks even under extreme conditions like converting singing to animals' timbre. The code and weight of SaMoye are available on https://github.com/CarlWangChina/SaMoye-SVC. 
The dataset and documents are likewise released in the same repository. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.07728v5-abstract-full').style.display = 'none'; document.getElementById('2407.07728v5-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 10 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This paper needs major changes for resubmit</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 68Txx(Primary)14F05; 91Fxx(Secondary) <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.7; J.5 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.04936">arXiv:2407.04936</a> <span> [<a href="https://arxiv.org/pdf/2407.04936">pdf</a>, <a href="https://arxiv.org/format/2407.04936">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> A Reference-free Metric for Language-Queried Audio Source Separation using Contrastive Language-Audio Pretraining </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Xiao%2C+F">Feiyang Xiao</a>, <a href="/search/eess?searchtype=author&query=Guan%2C+J">Jian Guan</a>, <a href="/search/eess?searchtype=author&query=Zhu%2C+Q">Qiaoxi Zhu</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+X">Xubo Liu</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+W">Wenbo Wang</a>, <a href="/search/eess?searchtype=author&query=Qi%2C+S">Shuhan Qi</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+K">Kejia Zhang</a>, <a href="/search/eess?searchtype=author&query=Sun%2C+J">Jianyuan Sun</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+W">Wenwu Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.04936v2-abstract-short" style="display: inline;"> Language-queried audio source separation (LASS) aims to separate an audio source guided by a text query, with the signal-to-distortion ratio (SDR)-based metrics being commonly used to objectively measure the quality of the separated audio. However, the SDR-based metrics require a reference signal, which is often difficult to obtain in real-world scenarios. 
In addition, with the SDR-based metrics,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.04936v2-abstract-full').style.display = 'inline'; document.getElementById('2407.04936v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.04936v2-abstract-full" style="display: none;"> Language-queried audio source separation (LASS) aims to separate an audio source guided by a text query, with the signal-to-distortion ratio (SDR)-based metrics being commonly used to objectively measure the quality of the separated audio. However, the SDR-based metrics require a reference signal, which is often difficult to obtain in real-world scenarios. In addition, with the SDR-based metrics, the content information of the text query is not considered effectively in LASS. This paper introduces a reference-free evaluation metric using a contrastive language-audio pretraining (CLAP) module, termed CLAPScore, which measures the semantic similarity between the separated audio and the text query. Unlike SDR, the proposed CLAPScore metric evaluates the quality of the separated audio based on the content information of the text query, without needing a reference signal. Experiments show that the CLAPScore provides an effective evaluation of the semantic relevance of the separated audio to the text query, as compared to the SDR metric, offering an alternative for the performance evaluation of LASS systems. The code for evaluation is publicly available. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.04936v2-abstract-full').style.display = 'none'; document.getElementById('2407.04936v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 5 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by DCASE 2024 Workshop. 
GitHub: https://github.com/LittleFlyingSheep/CLAPScore_for_LASS</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.03188">arXiv:2407.03188</a> <span> [<a href="https://arxiv.org/pdf/2407.03188">pdf</a>, <a href="https://arxiv.org/format/2407.03188">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> MuDiT & MuSiT: Alignment with Colloquial Expression in Description-to-Song Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Wang%2C+Z">Zihao Wang</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+H">Haoxuan Liu</a>, <a href="/search/eess?searchtype=author&query=Yu%2C+J">Jiaxing Yu</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+T">Tao Zhang</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+Y">Yan Liu</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+K">Kejun Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.03188v2-abstract-short" style="display: inline;"> Amid the rising intersection of generative AI and human artistic processes, this study probes the critical yet less-explored terrain of alignment in human-centric automatic song composition. We propose a novel task of Colloquial Description-to-Song Generation, which focuses on aligning the generated content with colloquial human expressions. This task is aimed at bridging the gap between colloquia… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.03188v2-abstract-full').style.display = 'inline'; document.getElementById('2407.03188v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.03188v2-abstract-full" style="display: none;"> Amid the rising intersection of generative AI and human artistic processes, this study probes the critical yet less-explored terrain of alignment in human-centric automatic song composition. We propose a novel task of Colloquial Description-to-Song Generation, which focuses on aligning the generated content with colloquial human expressions. This task is aimed at bridging the gap between colloquial language understanding and auditory expression within an AI model, with the ultimate goal of creating songs that accurately satisfy human auditory expectations and structurally align with musical norms. Current datasets are limited due to their narrow descriptive scope, semantic gaps and inaccuracies. To overcome data scarcity in this domain, we present the Caichong Music Dataset (CaiMD). CaiMD is manually annotated by both professional musicians and amateurs, offering diverse perspectives and a comprehensive understanding of colloquial descriptions. 
Unlike existing datasets pre-set with expert annotations or auto-generated ones with inherent biases, CaiMD better serves our purpose of aligning AI-generated music with widespread user-desired results. Moreover, we propose an innovative single-stage framework called MuDiT/MuSiT for enabling effective human-machine alignment in song creation. This framework not only achieves cross-modal comprehension between colloquial language and auditory music perceptions but also ensures generated songs align with user-desired results. MuDiT/MuSiT employs one DiT/SiT model for end-to-end generation of musical components like melody, harmony, rhythm, vocals, and instrumentation. The approach ensures harmonious sonic cohesiveness amongst all generated musical components, facilitating better resonance with human auditory expectations. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.03188v2-abstract-full').style.display = 'none'; document.getElementById('2407.03188v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 3 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">19 pages, 5 figures</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 68Txx(Primary)14F05; 91Fxx(Secondary) <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.7; J.5 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.00529">arXiv:2407.00529</a> <span> [<a href="https://arxiv.org/pdf/2407.00529">pdf</a>, <a href="https://arxiv.org/format/2407.00529">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Statistics Theory">math.ST</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Detecting and Identifying Selection Structure in Sequential Data </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zheng%2C+Y">Yujia Zheng</a>, <a href="/search/eess?searchtype=author&query=Tang%2C+Z">Zeyu Tang</a>, <a href="/search/eess?searchtype=author&query=Qiu%2C+Y">Yiwen Qiu</a>, <a href="/search/eess?searchtype=author&query=Sch%C3%B6lkopf%2C+B">Bernhard Schölkopf</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+K">Kun Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.00529v1-abstract-short" style="display: inline;"> We argue that the selective inclusion of data points based on latent objectives is common in practical situations, 
such as music sequences. Since this selection process often distorts statistical analysis, previous work primarily views it as a bias to be corrected and proposes various methods to mitigate its effect. However, while controlling this bias is crucial, selection also offers an opportun… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.00529v1-abstract-full').style.display = 'inline'; document.getElementById('2407.00529v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.00529v1-abstract-full" style="display: none;"> We argue that the selective inclusion of data points based on latent objectives is common in practical situations, such as music sequences. Since this selection process often distorts statistical analysis, previous work primarily views it as a bias to be corrected and proposes various methods to mitigate its effect. However, while controlling this bias is crucial, selection also offers an opportunity to provide a deeper insight into the hidden generation process, as it is a fundamental mechanism underlying what we observe. In particular, overlooking selection in sequential data can lead to an incomplete or overcomplicated inductive bias in modeling, such as assuming a universal autoregressive structure for all dependencies. Therefore, rather than merely viewing it as a bias, we explore the causal structure of selection in sequential data to delve deeper into the complete causal process. Specifically, we show that selection structure is identifiable without any parametric assumptions or interventional experiments. Moreover, even in cases where selection variables coexist with latent confounders, we still establish the nonparametric identifiability under appropriate structural conditions. Meanwhile, we also propose a provably correct algorithm to detect and identify selection structures as well as other types of dependencies. The framework has been validated empirically on both synthetic data and real-world music. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.00529v1-abstract-full').style.display = 'none'; document.getElementById('2407.00529v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. 
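<p class="is-size-7 has-text-grey-dark">A toy illustration of the phenomenon this paper studies, not its identification algorithm: selecting pairs of otherwise independent samples by a latent criterion induces dependence between them (Berkson-style selection bias):</p> <pre><code class="language-python">
import numpy as np

rng = np.random.default_rng(0)
x = rng.normal(size=(100_000, 2))      # i.i.d. columns: correlation is ~0

# Latent selection: keep only pairs whose (unobserved) sum exceeds a threshold.
kept = x[x.sum(axis=1) > 1.0]

print(np.corrcoef(x[:, 0], x[:, 1])[0, 1])        # close to 0.0
print(np.corrcoef(kept[:, 0], kept[:, 1])[0, 1])  # clearly negative
</code></pre>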
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICML 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.18690">arXiv:2405.18690</a> <span> [<a href="https://arxiv.org/pdf/2405.18690">pdf</a>, <a href="https://arxiv.org/format/2405.18690">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Differentially-Private Distributed Model Predictive Control of Linear Discrete-Time Systems with Global Constraints </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhang%2C+K">Kaixiang Zhang</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+Y">Yongqiang Wang</a>, <a href="/search/eess?searchtype=author&query=Song%2C+Z">Ziyou Song</a>, <a href="/search/eess?searchtype=author&query=Li%2C+Z">Zhaojian Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.18690v2-abstract-short" style="display: inline;"> Distributed model predictive control (DMPC) has attracted extensive attention as it can explicitly handle system constraints and achieve optimal control in a decentralized manner. However, the deployment of DMPC strategies generally requires the sharing of sensitive data among subsystems, which may violate the privacy of participating systems. In this paper, we propose a differentially-private DMP… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.18690v2-abstract-full').style.display = 'inline'; document.getElementById('2405.18690v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.18690v2-abstract-full" style="display: none;"> Distributed model predictive control (DMPC) has attracted extensive attention as it can explicitly handle system constraints and achieve optimal control in a decentralized manner. However, the deployment of DMPC strategies generally requires the sharing of sensitive data among subsystems, which may violate the privacy of participating systems. In this paper, we propose a differentially-private DMPC algorithm for linear discrete-time systems subject to coupled global constraints. Specifically, we first show that a conventional distributed dual gradient algorithm can be used to address the considered DMPC problem but cannot provide strong privacy preservation. Then, to protect privacy against the eavesdropper, we incorporate a differential-privacy noise injection mechanism into the DMPC framework and prove that the resulting distributed optimization algorithm can ensure both provable convergence to a global optimal solution and rigorous $蔚$-differential privacy. In addition, an implementation strategy of the DMPC is designed such that the recursive feasibility and stability of the closed-loop system are guaranteed. Simulation results are provided to demonstrate the effectiveness of the developed approach. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.18690v2-abstract-full').style.display = 'none'; document.getElementById('2405.18690v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 28 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">11 pages, 3 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.10561">arXiv:2405.10561</a> <span> [<a href="https://arxiv.org/pdf/2405.10561">pdf</a>, <a href="https://arxiv.org/format/2405.10561">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Infrared Image Super-Resolution via Lightweight Information Split Network </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Liu%2C+S">Shijie Liu</a>, <a href="/search/eess?searchtype=author&query=Yan%2C+K">Kang Yan</a>, <a href="/search/eess?searchtype=author&query=Qin%2C+F">Feiwei Qin</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+C">Changmiao Wang</a>, <a href="/search/eess?searchtype=author&query=Ge%2C+R">Ruiquan Ge</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+K">Kai Zhang</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+J">Jie Huang</a>, <a href="/search/eess?searchtype=author&query=Peng%2C+Y">Yong Peng</a>, <a href="/search/eess?searchtype=author&query=Cao%2C+J">Jin Cao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.10561v3-abstract-short" style="display: inline;"> Single image super-resolution (SR) is an established pixel-level vision task aimed at reconstructing a high-resolution image from its degraded low-resolution counterpart. Despite the notable advancements achieved by leveraging deep neural networks for SR, most existing deep learning architectures feature an extensive number of layers, leading to high computational complexity and substantial memory… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.10561v3-abstract-full').style.display = 'inline'; document.getElementById('2405.10561v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.10561v3-abstract-full" style="display: none;"> Single image super-resolution (SR) is an established pixel-level vision task aimed at reconstructing a high-resolution image from its degraded low-resolution counterpart. Despite the notable advancements achieved by leveraging deep neural networks for SR, most existing deep learning architectures feature an extensive number of layers, leading to high computational complexity and substantial memory demands. 
These issues become particularly pronounced in the context of infrared image SR, where infrared devices often have stringent storage and computational constraints. To mitigate these challenges, we introduce a novel, efficient, and precise single infrared image SR model, termed the Lightweight Information Split Network (LISN). The LISN comprises four main components: shallow feature extraction, deep feature extraction, dense feature fusion, and high-resolution infrared image reconstruction. A key innovation within this model is the introduction of the Lightweight Information Split Block (LISB) for deep feature extraction. The LISB employs a sequential process to extract hierarchical features, which are then aggregated based on the relevance of the features under consideration. By integrating channel splitting and shift operations, the LISB successfully strikes an optimal balance between enhanced SR performance and a lightweight framework. Comprehensive experimental evaluations reveal that the proposed LISN achieves superior performance over contemporary state-of-the-art methods in terms of both SR quality and model complexity, affirming its efficacy for practical deployment in resource-constrained infrared imaging applications. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.10561v3-abstract-full').style.display = 'none'; document.getElementById('2405.10561v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 17 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.10513">arXiv:2405.10513</a> <span> [<a href="https://arxiv.org/pdf/2405.10513">pdf</a>, <a href="https://arxiv.org/format/2405.10513">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Federated Learning With Energy Harvesting Devices: An MDP Framework </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhang%2C+K">Kai Zhang</a>, <a href="/search/eess?searchtype=author&query=Cao%2C+X">Xuanyu Cao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.10513v1-abstract-short" style="display: inline;"> Federated learning (FL) requires edge devices to perform local training and exchange information with a parameter server, leading to substantial energy consumption. A critical challenge in practical FL systems is the rapid energy depletion of battery-limited edge devices, which curtails their operational lifespan and affects the learning performance. 
To address this issue, we apply energy harvesti… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.10513v1-abstract-full').style.display = 'inline'; document.getElementById('2405.10513v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.10513v1-abstract-full" style="display: none;"> Federated learning (FL) requires edge devices to perform local training and exchange information with a parameter server, leading to substantial energy consumption. A critical challenge in practical FL systems is the rapid energy depletion of battery-limited edge devices, which curtails their operational lifespan and affects the learning performance. To address this issue, we apply energy harvesting technique in FL systems to extract ambient energy for continuously powering edge devices. We first establish the convergence bound for the wireless FL system with energy harvesting devices, illustrating that the convergence is impacted by partial device participation and packet drops, both of which depend on the energy supply. To accelerate the convergence, we formulate a joint device scheduling and power control problem and model it as a Markov decision process (MDP). By solving this MDP, we derive the optimal transmission policy and demonstrate that it possesses a monotone structure with respect to the battery and channel states. To overcome the curse of dimensionality caused by the exponential complexity of computing the optimal policy, we propose a low-complexity algorithm, which is asymptotically optimal as the number of devices increases. Furthermore, for unknown channels and harvested energy statistics, we develop a structure-enhanced deep reinforcement learning algorithm that leverages the monotone structure of the optimal policy to improve the training performance. Finally, extensive numerical experiments on real-world datasets are presented to validate the theoretical results and corroborate the effectiveness of the proposed algorithms. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.10513v1-abstract-full').style.display = 'none'; document.getElementById('2405.10513v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. 
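</p> <p class="is-size-7"><em>For illustration only: the monotone (threshold) structure shown for the optimal transmission policy can be pictured with a toy rule that is nondecreasing in both the battery level and the channel state; the thresholds below are assumptions for this sketch, not derived values.</em></p> <pre class="is-size-7"><code class="language-python">
import numpy as np

def transmit(battery, channel, b_th=2, g_th=0.5):
    # Monotone structure: if transmitting is chosen at (b, g), it is
    # also chosen at any (b', g') with b' >= b and g' >= g.
    return battery >= b_th and channel >= g_th

states = [(b, g) for b in range(5) for g in np.linspace(0.0, 1.0, 5)]
policy = {(b, round(float(g), 2)): transmit(b, g) for b, g in states}
</code></pre> <p class="is-size-7">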
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.19547">arXiv:2404.19547</a> <span> [<a href="https://arxiv.org/pdf/2404.19547">pdf</a>, <a href="https://arxiv.org/format/2404.19547">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multiagent Systems">cs.MA</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Optimization and Control">math.OC</span> </div> </div> <p class="title is-5 mathjax"> Distributed Traffic Signal Control via Coordinated Maximum Pressure-plus-Penalty </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=T%C3%BCtsch%2C+V">Vinzenz Tütsch</a>, <a href="/search/eess?searchtype=author&query=He%2C+Z">Zhiyu He</a>, <a href="/search/eess?searchtype=author&query=D%C3%B6rfler%2C+F">Florian Dörfler</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+K">Kenan Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.19547v1-abstract-short" style="display: inline;"> This paper develops an adaptive traffic control policy inspired by Maximum Pressure (MP) while imposing coordination across intersections. The proposed Coordinated Maximum Pressure-plus-Penalty (CMPP) control policy features a local objective for each intersection that consists of the total pressure within the neighborhood and a penalty accounting for the queue capacities and continuous green time… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.19547v1-abstract-full').style.display = 'inline'; document.getElementById('2404.19547v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.19547v1-abstract-full" style="display: none;"> This paper develops an adaptive traffic control policy inspired by Maximum Pressure (MP) while imposing coordination across intersections. The proposed Coordinated Maximum Pressure-plus-Penalty (CMPP) control policy features a local objective for each intersection that consists of the total pressure within the neighborhood and a penalty accounting for the queue capacities and continuous green time for certain movements. The corresponding control task is reformulated as a distributed optimization problem and solved via two customized algorithms: one based on the alternating direction method of multipliers (ADMM) and the other following a greedy heuristic augmented with a majority vote. CMPP not only provides a theoretical guarantee of queuing network stability but also outperforms several benchmark controllers in simulations on a large-scale real traffic network with lower average travel and waiting time per vehicle, as well as less network congestion. Furthermore, CMPP with the greedy algorithm enjoys comparable computational efficiency to fully decentralized controllers without significantly compromising the control performance, which highlights its great potential for real-world deployment. 
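<br /><br /><em class="is-size-7">For illustration only: a toy pressure-plus-penalty phase selection at a single intersection; CMPP additionally couples such decisions across neighboring intersections (via ADMM or the greedy majority-vote heuristic). The names and the penalty form below are assumptions for this sketch.</em> <pre class="is-size-7"><code class="language-python">
import numpy as np

def select_phase(q_up, q_down, phases, green_time, kappa=0.1):
    """Pick the phase maximizing pressure (upstream minus downstream
    queue mass over its movements) minus a continuous-green-time penalty."""
    scores = [sum(q_up[m] - q_down[m] for m in moves) - kappa * green_time[p]
              for p, moves in enumerate(phases)]
    return int(np.argmax(scores))

q_up, q_down = np.array([5, 2, 7, 1]), np.array([1, 3, 2, 0])
print(select_phase(q_up, q_down, phases=[[0, 2], [1, 3]], green_time=[30, 5]))
</code></pre>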
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.19547v1-abstract-full').style.display = 'none'; document.getElementById('2404.19547v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.17755">arXiv:2404.17755</a> <span> [<a href="https://arxiv.org/pdf/2404.17755">pdf</a>, <a href="https://arxiv.org/ps/2404.17755">ps</a>, <a href="https://arxiv.org/format/2404.17755">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Dual-Functional Waveform Design with Local Sidelobe Suppression via OTFS Signaling </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhang%2C+K">Kecheng Zhang</a>, <a href="/search/eess?searchtype=author&query=Yuan%2C+W">Weijie Yuan</a>, <a href="/search/eess?searchtype=author&query=Fan%2C+P">Pingzhi Fan</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+X">Xianbin Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.17755v2-abstract-short" style="display: inline;"> Integrated sensing and communication (ISAC) is viewed as a key technology in future wireless networks. One of the main challenges in realizing ISAC is developing dual-functional waveforms that can communicate with communication receivers and perform radar sensing simultaneously. In this paper, we consider the joint design of a dual-functional orthogonal time-frequency space (OTFS) signal and a rec… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.17755v2-abstract-full').style.display = 'inline'; document.getElementById('2404.17755v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.17755v2-abstract-full" style="display: none;"> Integrated sensing and communication (ISAC) is viewed as a key technology in future wireless networks. One of the main challenges in realizing ISAC is developing dual-functional waveforms that can communicate with communication receivers and perform radar sensing simultaneously. In this paper, we consider the joint design of a dual-functional orthogonal time-frequency space (OTFS) signal and a receiving filter for the ISAC system. The problem of ISAC waveform design is formulated as the minimization of the weighted integrated sidelobe level (WISL) of the ambiguity function and the interference term from ISAC waveform, with constraints on signal-to-noise ratio loss. The majorization-minimization algorithm combined with alternating iterative minimization is implemented to solve the optimization problem. Simulation results show that the WISL and the interference term can be significantly decreased to guarantee achievable data rates and detection performance. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.17755v2-abstract-full').style.display = 'none'; document.getElementById('2404.17755v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 26 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted as a Correspondence by IEEE Transactions on Vehicular Technology. 6 pages, 4 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.13748">arXiv:2404.13748</a> <span> [<a href="https://arxiv.org/pdf/2404.13748">pdf</a>, <a href="https://arxiv.org/format/2404.13748">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Numerical Analysis">math.NA</span> </div> </div> <p class="title is-5 mathjax"> Application of Kalman Filter in Stochastic Differential Equations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Bao%2C+W">Wencheng Bao</a>, <a href="/search/eess?searchtype=author&query=Feng%2C+S">Shi Feng</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+K">Kaiwen Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.13748v1-abstract-short" style="display: inline;"> In areas such as finance, engineering, and science, we often face situations that change quickly and unpredictably. These situations are tough to handle and require special tools and methods capable of understanding and predicting what might happen next. Stochastic Differential Equations (SDEs) are renowned for modeling and analyzing real-world dynamical systems. However, obtaining the parameters,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.13748v1-abstract-full').style.display = 'inline'; document.getElementById('2404.13748v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.13748v1-abstract-full" style="display: none;"> In areas such as finance, engineering, and science, we often face situations that change quickly and unpredictably. These situations are tough to handle and require special tools and methods capable of understanding and predicting what might happen next. Stochastic Differential Equations (SDEs) are renowned for modeling and analyzing real-world dynamical systems. However, obtaining the parameters, boundary conditions, and closed-form solutions of SDEs can often be challenging. In this paper, we will discuss the application of Kalman filtering theory to SDEs, including Extended Kalman filtering and Particle Extended Kalman filtering. We will explore how to fit existing SDE systems through filtering and track the original SDEs by fitting the obtained closed-form solutions. 
This approach aims to gather more information about these SDEs, which could be used in various ways, such as incorporating them into parameters of data-based SDE models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.13748v1-abstract-full').style.display = 'none'; document.getElementById('2404.13748v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">18 pages, 14 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.07341">arXiv:2404.07341</a> <span> [<a href="https://arxiv.org/pdf/2404.07341">pdf</a>, <a href="https://arxiv.org/format/2404.07341">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Conformer-1: Robust ASR via Large-Scale Semisupervised Bootstrapping </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhang%2C+K">Kevin Zhang</a>, <a href="/search/eess?searchtype=author&query=Chkhetiani%2C+L">Luka Chkhetiani</a>, <a href="/search/eess?searchtype=author&query=Ramirez%2C+F+M">Francis McCann Ramirez</a>, <a href="/search/eess?searchtype=author&query=Khare%2C+Y">Yash Khare</a>, <a href="/search/eess?searchtype=author&query=Vanzo%2C+A">Andrea Vanzo</a>, <a href="/search/eess?searchtype=author&query=Liang%2C+M">Michael Liang</a>, <a href="/search/eess?searchtype=author&query=Martin%2C+S+R">Sergio Ramirez Martin</a>, <a href="/search/eess?searchtype=author&query=Oexle%2C+G">Gabriel Oexle</a>, <a href="/search/eess?searchtype=author&query=Bousbib%2C+R">Ruben Bousbib</a>, <a href="/search/eess?searchtype=author&query=Peyash%2C+T">Taufiquzzaman Peyash</a>, <a href="/search/eess?searchtype=author&query=Nguyen%2C+M">Michael Nguyen</a>, <a href="/search/eess?searchtype=author&query=Pulliam%2C+D">Dillon Pulliam</a>, <a href="/search/eess?searchtype=author&query=Donato%2C+D">Domenic Donato</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.07341v2-abstract-short" style="display: inline;"> This paper presents Conformer-1, an end-to-end Automatic Speech Recognition (ASR) model trained on an extensive dataset of 570k hours of speech audio data, 91% of which was acquired from publicly available sources. To achieve this, we perform Noisy Student Training after generating pseudo-labels for the unlabeled public data using a strong Conformer RNN-T baseline model. 
The addition of these pseu… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.07341v2-abstract-full').style.display = 'inline'; document.getElementById('2404.07341v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.07341v2-abstract-full" style="display: none;"> This paper presents Conformer-1, an end-to-end Automatic Speech Recognition (ASR) model trained on an extensive dataset of 570k hours of speech audio data, 91% of which was acquired from publicly available sources. To achieve this, we perform Noisy Student Training after generating pseudo-labels for the unlabeled public data using a strong Conformer RNN-T baseline model. The addition of these pseudo-labeled data results in remarkable improvements in relative Word Error Rate (WER) by 11.5% and 24.3% for our asynchronous and realtime models, respectively. Additionally, the model is more robust to background noise owing to the addition of these data. The results obtained in this study demonstrate that the incorporation of pseudo-labeled publicly available data is a highly effective strategy for improving ASR accuracy and noise robustness. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.07341v2-abstract-full').style.display = 'none'; document.getElementById('2404.07341v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 10 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.01717">arXiv:2404.01717</a> <span> [<a href="https://arxiv.org/pdf/2404.01717">pdf</a>, <a href="https://arxiv.org/format/2404.01717">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> AddSR: Accelerating Diffusion-based Blind Super-Resolution with Adversarial Diffusion Distillation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Xie%2C+R">Rui Xie</a>, <a href="/search/eess?searchtype=author&query=Zhao%2C+C">Chen Zhao</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+K">Kai Zhang</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+Z">Zhenyu Zhang</a>, <a href="/search/eess?searchtype=author&query=Zhou%2C+J">Jun Zhou</a>, <a href="/search/eess?searchtype=author&query=Yang%2C+J">Jian Yang</a>, <a href="/search/eess?searchtype=author&query=Tai%2C+Y">Ying Tai</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.01717v4-abstract-short" style="display: inline;"> Blind super-resolution methods based on stable diffusion showcase formidable generative capabilities in reconstructing clear high-resolution images with intricate details from low-resolution inputs. 
However, their practical applicability is often hampered by poor efficiency, stemming from the requirement of thousands or hundreds of sampling steps. Inspired by the efficient adversarial diffusion di… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.01717v4-abstract-full').style.display = 'inline'; document.getElementById('2404.01717v4-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.01717v4-abstract-full" style="display: none;"> Blind super-resolution methods based on stable diffusion showcase formidable generative capabilities in reconstructing clear high-resolution images with intricate details from low-resolution inputs. However, their practical applicability is often hampered by poor efficiency, stemming from the requirement of thousands or hundreds of sampling steps. Inspired by the efficient adversarial diffusion distillation (ADD), we design AddSR to address this issue by incorporating the ideas of both distillation and ControlNet. Specifically, we first propose a prediction-based self-refinement strategy to provide high-frequency information in the student model output with marginal additional time cost. Furthermore, we refine the training process by employing HR images, rather than LR images, to regulate the teacher model, providing a more robust constraint for distillation. Second, we introduce a timestep-adaptive ADD to address the perception-distortion imbalance problem introduced by original ADD. Extensive experiments demonstrate our AddSR generates better restoration results, while achieving faster speed than previous SD-based state-of-the-art models (e.g., $7\times$ faster than SeeSR). <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.01717v4-abstract-full').style.display = 'none'; document.getElementById('2404.01717v4-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 2 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.11883">arXiv:2403.11883</a> <span> [<a href="https://arxiv.org/pdf/2403.11883">pdf</a>, <a href="https://arxiv.org/ps/2403.11883">ps</a>, <a href="https://arxiv.org/format/2403.11883">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Data-Enabled Predictive Iterative Control </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhang%2C+K">Kai Zhang</a>, <a href="/search/eess?searchtype=author&query=Zuliani%2C+R">Riccardo Zuliani</a>, <a href="/search/eess?searchtype=author&query=Balta%2C+E+C">Efe C. 
Balta</a>, <a href="/search/eess?searchtype=author&query=Lygeros%2C+J">John Lygeros</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.11883v2-abstract-short" style="display: inline;"> This work introduces the Data-Enabled Predictive iteRative Control (DeePRC) algorithm, a direct data-driven approach for iterative LTI systems. The DeePRC learns from previous iterations to improve its performance and achieves the optimal cost. By utilizing a tube-based variation of the DeePRC scheme, we propose a two-stage approach that enables safe active exploration using a left-kernel-based in… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.11883v2-abstract-full').style.display = 'inline'; document.getElementById('2403.11883v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.11883v2-abstract-full" style="display: none;"> This work introduces the Data-Enabled Predictive iteRative Control (DeePRC) algorithm, a direct data-driven approach for iterative LTI systems. The DeePRC learns from previous iterations to improve its performance and achieves the optimal cost. By utilizing a tube-based variation of the DeePRC scheme, we propose a two-stage approach that enables safe active exploration using a left-kernel-based input disturbance design. This method generates informative trajectories to enrich the historical data, which extends the maximum achievable prediction horizon and leads to faster iteration convergence. In addition, we present an end-to-end formulation of the two-stage approach, integrating the disturbance design procedure into the planning phase. We showcase the effectiveness of the proposed algorithms on a numerical experiment. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.11883v2-abstract-full').style.display = 'none'; document.getElementById('2403.11883v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. 
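</p> <p class="is-size-7"><em>For illustration only: the block-Hankel matrices of recorded input-output data that underpin DeePC-style direct data-driven control, of which DeePRC is an iterative variant; the toy system and window length here are assumptions for this sketch.</em></p> <pre class="is-size-7"><code class="language-python">
import numpy as np

def block_hankel(w, L):
    """Columns are length-L windows of the recorded signal w (T x d)."""
    T = w.shape[0]
    return np.column_stack([w[i:i + L].reshape(-1) for i in range(T - L + 1)])

u = np.random.randn(100, 1)        # persistently exciting input record
y = 0.1 * np.cumsum(u, axis=0)     # stand-in for the measured response
H_u, H_y = block_hankel(u, 10), block_hankel(y, 10)
# DeePC-style methods split H_u, H_y into past/future blocks and solve
# [U_p; Y_p; U_f] g = [u_ini; y_ini; u_future] for the trajectory weights g.
</code></pre> <p class="is-size-7">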
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.05906">arXiv:2403.05906</a> <span> [<a href="https://arxiv.org/pdf/2403.05906">pdf</a>, <a href="https://arxiv.org/format/2403.05906">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Segmentation Guided Sparse Transformer for Under-Display Camera Image Restoration </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Xue%2C+J">Jingyun Xue</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+T">Tao Wang</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+J">Jun Wang</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+K">Kaihao Zhang</a>, <a href="/search/eess?searchtype=author&query=Luo%2C+W">Wenhan Luo</a>, <a href="/search/eess?searchtype=author&query=Ren%2C+W">Wenqi Ren</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+Z">Zikun Liu</a>, <a href="/search/eess?searchtype=author&query=Park%2C+H">Hyunhee Park</a>, <a href="/search/eess?searchtype=author&query=Cao%2C+X">Xiaochun Cao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.05906v1-abstract-short" style="display: inline;"> Under-Display Camera (UDC) is an emerging technology that achieves full-screen display via hiding the camera under the display panel. However, the current implementation of UDC causes serious degradation. The incident light required for camera imaging undergoes attenuation and diffraction when passing through the display panel, leading to various artifacts in UDC imaging. Presently, the prevailing… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.05906v1-abstract-full').style.display = 'inline'; document.getElementById('2403.05906v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.05906v1-abstract-full" style="display: none;"> Under-Display Camera (UDC) is an emerging technology that achieves full-screen display via hiding the camera under the display panel. However, the current implementation of UDC causes serious degradation. The incident light required for camera imaging undergoes attenuation and diffraction when passing through the display panel, leading to various artifacts in UDC imaging. Presently, the prevailing UDC image restoration methods predominantly utilize convolutional neural network architectures, whereas Transformer-based methods have exhibited superior performance in the majority of image restoration tasks. This is attributed to the Transformer's capability to sample global features for the local reconstruction of images, thereby achieving high-quality image restoration. In this paper, we observe that when using the Vision Transformer for UDC degraded image restoration, the global attention samples a large amount of redundant information and noise. Furthermore, compared to the ordinary Transformer employing dense attention, the Transformer utilizing sparse attention can alleviate the adverse impact of redundant information and noise. 
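<br /><br /><em class="is-size-7">For illustration only: one common realization of sparse self-attention, keeping the top-k logits per query before the softmax; SGSFormer's segmentation-guided masking differs, so treat this purely as a sketch of the mechanism.</em> <pre class="is-size-7"><code class="language-python">
import numpy as np

def topk_sparse_attention(Q, K, V, k=4):
    logits = Q @ K.T / np.sqrt(Q.shape[-1])
    kth = np.sort(logits, axis=-1)[:, -k][:, None]      # k-th largest per query
    logits = np.where(logits >= kth, logits, -np.inf)   # drop low-relevance pairs
    w = np.exp(logits - logits.max(axis=-1, keepdims=True))
    return (w / w.sum(axis=-1, keepdims=True)) @ V

rng = np.random.default_rng(1)
Q, K, V = (rng.standard_normal((16, 32)) for _ in range(3))
out = topk_sparse_attention(Q, K, V)
</code></pre>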
Building upon this discovery, we propose a Segmentation Guided Sparse Transformer method (SGSFormer) for the task of restoring high-quality images from UDC degraded images. Specifically, we utilize sparse self-attention to filter out redundant information and noise, directing the model's attention to focus on the features more relevant to the degraded regions in need of reconstruction. Moreover, we integrate the instance segmentation map as prior information to guide the sparse self-attention in filtering and focusing on the correct regions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.05906v1-abstract-full').style.display = 'none'; document.getElementById('2403.05906v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">13 pages, 10 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2402.13629">arXiv:2402.13629</a> <span> </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Adversarial Purification and Fine-tuning for Robust UDC Image Restoration </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Song%2C+Z">Zhenbo Song</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+Z">Zhenyuan Zhang</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+K">Kaihao Zhang</a>, <a href="/search/eess?searchtype=author&query=Fan%2C+Z">Zhaoxin Fan</a>, <a href="/search/eess?searchtype=author&query=Lu%2C+J">Jianfeng Lu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2402.13629v3-abstract-short" style="display: inline;"> This study delves into the enhancement of Under-Display Camera (UDC) image restoration models, focusing on their robustness against adversarial attacks. Despite its innovative approach to seamless display integration, UDC technology faces unique image degradation challenges exacerbated by the susceptibility to adversarial perturbations. Our research initially conducts an in-depth robustness evalua… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.13629v3-abstract-full').style.display = 'inline'; document.getElementById('2402.13629v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2402.13629v3-abstract-full" style="display: none;"> This study delves into the enhancement of Under-Display Camera (UDC) image restoration models, focusing on their robustness against adversarial attacks. Despite its innovative approach to seamless display integration, UDC technology faces unique image degradation challenges exacerbated by the susceptibility to adversarial perturbations. 
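<br /><br /><em class="is-size-7">For illustration only: the FGSM-style white-box perturbation that robustness evaluations of this kind typically apply to a restoration model's input; the gradient here is a random stand-in, since a real attack backpropagates the restoration loss through the model.</em> <pre class="is-size-7"><code class="language-python">
import numpy as np

def fgsm(x, grad_loss_x, eps=2 / 255):
    """One-step sign-gradient attack, clipped to the valid pixel range."""
    return np.clip(x + eps * np.sign(grad_loss_x), 0.0, 1.0)

x = np.random.rand(3, 32, 32)        # clean input image
g = np.random.randn(*x.shape)        # placeholder for dL/dx from autograd
x_adv = fgsm(x, g)
assert np.max(np.abs(x_adv - x)) <= 2 / 255 + 1e-9
</code></pre>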
Our research initially conducts an in-depth robustness evaluation of deep-learning-based UDC image restoration models by employing several white-box and black-box attacking methods. This evaluation is pivotal in understanding the vulnerabilities of current UDC image restoration techniques. Following the assessment, we introduce a defense framework integrating adversarial purification with subsequent fine-tuning processes. First, our approach employs diffusion-based adversarial purification, effectively neutralizing adversarial perturbations. Then, we apply the fine-tuning methodologies to refine the image restoration models further, ensuring that the quality and fidelity of the restored images are maintained. The effectiveness of our proposed approach is validated through extensive experiments, showing marked improvements in resilience against typical adversarial attacks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.13629v3-abstract-full').style.display = 'none'; document.getElementById('2402.13629v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 21 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Failure to meet expectations</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2402.09871">arXiv:2402.09871</a> <span> [<a href="https://arxiv.org/pdf/2402.09871">pdf</a>, <a href="https://arxiv.org/format/2402.09871">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> MuChin: A Chinese Colloquial Description Benchmark for Evaluating Language Models in the Field of Music </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Wang%2C+Z">Zihao Wang</a>, <a href="/search/eess?searchtype=author&query=Li%2C+S">Shuyu Li</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+T">Tao Zhang</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+Q">Qi Wang</a>, <a href="/search/eess?searchtype=author&query=Yu%2C+P">Pengfei Yu</a>, <a href="/search/eess?searchtype=author&query=Luo%2C+J">Jinyang Luo</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+Y">Yan Liu</a>, <a href="/search/eess?searchtype=author&query=Xi%2C+M">Ming Xi</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+K">Kejun Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2402.09871v4-abstract-short" style="display: inline;"> The rapidly evolving multimodal Large Language Models (LLMs) urgently require new benchmarks to 
uniformly evaluate their performance on understanding and textually describing music. However, due to semantic gaps between Music Information Retrieval (MIR) algorithms and human understanding, discrepancies between professionals and the public, and low precision of annotations, existing music descripti… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.09871v4-abstract-full').style.display = 'inline'; document.getElementById('2402.09871v4-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2402.09871v4-abstract-full" style="display: none;"> The rapidly evolving multimodal Large Language Models (LLMs) urgently require new benchmarks to uniformly evaluate their performance on understanding and textually describing music. However, due to semantic gaps between Music Information Retrieval (MIR) algorithms and human understanding, discrepancies between professionals and the public, and low precision of annotations, existing music description datasets cannot serve as benchmarks. To this end, we present MuChin, the first open-source music description benchmark in Chinese colloquial language, designed to evaluate the performance of multimodal LLMs in understanding and describing music. We established the Caichong Music Annotation Platform (CaiMAP) that employs an innovative multi-person, multi-stage assurance method, and recruited both amateurs and professionals to ensure the precision of annotations and alignment with popular semantics. Utilizing this method, we built a dataset with multi-dimensional, high-precision music annotations, the Caichong Music Dataset (CaiMD), and carefully selected 1,000 high-quality entries to serve as the test set for MuChin. Based on MuChin, we analyzed the discrepancies between professionals and amateurs in terms of music description, and empirically demonstrated the effectiveness of annotated data for fine-tuning LLMs. Ultimately, we employed MuChin to evaluate existing music understanding models on their ability to provide colloquial descriptions of music. All data related to the benchmark, along with the scoring code and detailed appendices, have been open-sourced (https://github.com/CarlWangChina/MuChin/). <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.09871v4-abstract-full').style.display = 'none'; document.getElementById('2402.09871v4-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 15 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by International Joint Conference on Artificial Intelligence 2024 (IJCAI 2024)</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 68Txx(Primary)14F05; 91Fxx(Secondary) <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.7; J.5 </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Zhang%2C+K&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Zhang%2C+K&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhang%2C+K&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhang%2C+K&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhang%2C+K&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhang%2C+K&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> </ul> </nav> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a 
href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>