
Search | arXiv e-print repository

<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1&ndash;50 of 143 results for author: <span class="mathjax">Yang, W</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> <div class="content"> <form method="GET" action="/search/eess" aria-role="search"> Searching in archive <strong>eess</strong>. <a href="/search/?searchtype=author&amp;query=Yang%2C+W">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Yang, W"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Yang%2C+W&amp;terms-0-field=author&amp;size=50&amp;order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Yang, W"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Yang%2C+W&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Yang%2C+W&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Yang%2C+W&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Yang%2C+W&amp;start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.14842">arXiv:2411.14842</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.14842">pdf</a>, <a href="https://arxiv.org/format/2411.14842">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Who Can Withstand Chat-Audio Attacks? An Evaluation Benchmark for Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Yang%2C+W">Wanqi Yang</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+Y">Yanda Li</a>, <a href="/search/eess?searchtype=author&amp;query=Fang%2C+M">Meng Fang</a>, <a href="/search/eess?searchtype=author&amp;query=Wei%2C+Y">Yunchao Wei</a>, <a href="/search/eess?searchtype=author&amp;query=Zhou%2C+T">Tianyi Zhou</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+L">Ling Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.14842v1-abstract-short" style="display: inline;"> Adversarial audio attacks pose a significant threat to the growing use of large language models (LLMs) in voice-based human-machine interactions. While existing research has primarily focused on model-specific adversarial methods, real-world applications demand a more generalizable and universal approach to audio adversarial attacks. 
In this paper, we introduce the Chat-Audio Attacks (CAA) benchma&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14842v1-abstract-full').style.display = 'inline'; document.getElementById('2411.14842v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.14842v1-abstract-full" style="display: none;"> Adversarial audio attacks pose a significant threat to the growing use of large language models (LLMs) in voice-based human-machine interactions. While existing research has primarily focused on model-specific adversarial methods, real-world applications demand a more generalizable and universal approach to audio adversarial attacks. In this paper, we introduce the Chat-Audio Attacks (CAA) benchmark including four distinct types of audio attacks, which aims to explore the the vulnerabilities of LLMs to these audio attacks in conversational scenarios. To evaluate the robustness of LLMs, we propose three evaluation strategies: Standard Evaluation, utilizing traditional metrics to quantify model performance under attacks; GPT-4o-Based Evaluation, which simulates real-world conversational complexities; and Human Evaluation, offering insights into user perception and trust. We evaluate six state-of-the-art LLMs with voice interaction capabilities, including Gemini-1.5-Pro, GPT-4o, and others, using three distinct evaluation methods on the CAA benchmark. Our comprehensive analysis reveals the impact of four types of audio attacks on the performance of these models, demonstrating that GPT-4o exhibits the highest level of resilience. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14842v1-abstract-full').style.display = 'none'; document.getElementById('2411.14842v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.14816">arXiv:2411.14816</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.14816">pdf</a>, <a href="https://arxiv.org/format/2411.14816">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Unsupervised Multi-view UAV Image Geo-localization via Iterative Rendering </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Li%2C+H">Haoyuan Li</a>, <a href="/search/eess?searchtype=author&amp;query=Xu%2C+C">Chang Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Yang%2C+W">Wen Yang</a>, <a href="/search/eess?searchtype=author&amp;query=Mi%2C+L">Li Mi</a>, <a href="/search/eess?searchtype=author&amp;query=Yu%2C+H">Huai Yu</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+H">Haijian Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.14816v1-abstract-short" style="display: inline;"> Unmanned Aerial Vehicle (UAV) Cross-View Geo-Localization (CVGL) presents significant challenges due to the view discrepancy between oblique UAV images and overhead satellite images. Existing methods heavily rely on the supervision of labeled datasets to extract viewpoint-invariant features for cross-view retrieval. However, these methods have expensive training costs and tend to overfit the regio&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14816v1-abstract-full').style.display = 'inline'; document.getElementById('2411.14816v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.14816v1-abstract-full" style="display: none;"> Unmanned Aerial Vehicle (UAV) Cross-View Geo-Localization (CVGL) presents significant challenges due to the view discrepancy between oblique UAV images and overhead satellite images. Existing methods heavily rely on the supervision of labeled datasets to extract viewpoint-invariant features for cross-view retrieval. However, these methods have expensive training costs and tend to overfit the region-specific cues, showing limited generalizability to new regions. To overcome this issue, we propose an unsupervised solution that lifts the scene representation to 3d space from UAV observations for satellite image generation, providing robust representation against view distortion. By generating orthogonal images that closely resemble satellite views, our method reduces view discrepancies in feature representation and mitigates shortcuts in region-specific image pairing. To further align the rendered image&#39;s perspective with the real one, we design an iterative camera pose updating mechanism that progressively modulates the rendered query image with potential satellite targets, eliminating spatial offsets relative to the reference images. 
Additionally, this iterative refinement strategy enhances cross-view feature invariance through view-consistent fusion across iterations. As such, our unsupervised paradigm naturally avoids the problem of region-specific overfitting, enabling generic CVGL for UAV images without feature fine-tuning or data-driven training. Experiments on the University-1652 and SUES-200 datasets demonstrate that our approach significantly improves geo-localization accuracy while maintaining robustness across diverse regions. Notably, without model fine-tuning or paired training, our method achieves competitive performance with recent supervised methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14816v1-abstract-full').style.display = 'none'; document.getElementById('2411.14816v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">13 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.23577">arXiv:2410.23577</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.23577">pdf</a>, <a href="https://arxiv.org/format/2410.23577">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> MS-Glance: Bio-Insipred Non-semantic Context Vectors and their Applications in Supervising Image Reconstruction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Gao%2C+Z">Ziqi Gao</a>, <a href="/search/eess?searchtype=author&amp;query=Yang%2C+W">Wendi Yang</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+Y">Yujia Li</a>, <a href="/search/eess?searchtype=author&amp;query=Xing%2C+L">Lei Xing</a>, <a href="/search/eess?searchtype=author&amp;query=Zhou%2C+S+K">S. Kevin Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.23577v2-abstract-short" style="display: inline;"> Non-semantic context information is crucial for visual recognition, as the human visual perception system first uses global statistics to process scenes rapidly before identifying specific objects. 
However, while semantic information is increasingly incorporated into computer vision tasks such as image reconstruction, non-semantic information, such as global spatial structures, is often overlooked&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23577v2-abstract-full').style.display = 'inline'; document.getElementById('2410.23577v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.23577v2-abstract-full" style="display: none;"> Non-semantic context information is crucial for visual recognition, as the human visual perception system first uses global statistics to process scenes rapidly before identifying specific objects. However, while semantic information is increasingly incorporated into computer vision tasks such as image reconstruction, non-semantic information, such as global spatial structures, is often overlooked. To bridge the gap, we propose a biologically informed non-semantic context descriptor, \textbf{MS-Glance}, along with the Glance Index Measure for comparing two images. A Global Glance vector is formulated by randomly retrieving pixels based on a perception-driven rule from an image to form a vector representing non-semantic global context, while a local Glance vector is a flattened local image window, mimicking a zoom-in observation. The Glance Index is defined as the inner product of two standardized sets of Glance vectors. We evaluate the effectiveness of incorporating Glance supervision in two reconstruction tasks: image fitting with implicit neural representation (INR) and undersampled MRI reconstruction. Extensive experimental results show that MS-Glance outperforms existing image restoration losses across both natural and medical images. The code is available at \url{https://github.com/Z7Gao/MSGlance}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23577v2-abstract-full').style.display = 'none'; document.getElementById('2410.23577v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 30 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by WACV 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.14882">arXiv:2410.14882</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.14882">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Multi-diseases detection with memristive system on chip </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Wang%2C+Z">Zihan Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Yang%2C+D+W">Daniel W. 
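As a rough illustration of the descriptor described in entry 3 above (arXiv:2410.23577), the sketch below builds a single global Glance-style vector by sampling pixels at a shared set of random positions and scores two images with an inner product of standardized vectors. This is only a minimal reading of the abstract, not the authors' implementation (their code is linked in the abstract); the fixed random sampling rule, the single-vector simplification of the "standardized sets", and the names glance_vector and glance_index are assumptions made for illustration.

import numpy as np

def glance_vector(img, idx):
    # Global Glance-style vector: pixel values gathered at a shared set of random positions.
    return img.reshape(-1)[idx].astype(np.float64)

def glance_index(u, v, eps=1e-8):
    # Inner product of standardized vectors, scaled so that identical inputs score 1.
    u = (u - u.mean()) / (u.std() + eps)
    v = (v - v.mean()) / (v.std() + eps)
    return float(u @ v) / u.size

# Toy usage: compare an image with a noisy copy under the same sampling rule.
rng = np.random.default_rng(0)
img = rng.random((64, 64))
noisy = img + 0.05 * rng.standard_normal(img.shape)
idx = rng.choice(img.size, size=512, replace=False)
print(glance_index(glance_vector(img, idx), glance_vector(noisy, idx)))

A local Glance vector, per the abstract, would instead flatten a small image window; the same index can then be applied to pairs of windows.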
4. arXiv:2410.14882 [pdf] - cs.AR (Hardware Architecture); eess.SP (Signal Processing)
Title: Multi-diseases detection with memristive system on chip
Authors: Zihan Wang, Daniel W. Yang, Zerui Liu, Evan Yan, Heming Sun, Ning Ge, Miao Hu, Wei Wu
Abstract: This study presents the first implementation of multilayer neural networks on a memristor/CMOS integrated system on chip (SoC) to simultaneously detect multiple diseases. To overcome limitations in medical data, generative AI techniques are used to enhance the dataset, improving the classifier's robustness and diversity. The system achieves notable performance with low latency, high accuracy (91.82%), and energy efficiency, facilitated by end-to-end execution on a memristor-based SoC with ten 256x256 crossbar arrays and an integrated on-chip processor. This research showcases the transformative potential of memristive in-memory computing hardware in accelerating machine learning applications for medical diagnostics.
Submitted 18 October, 2024; originally announced October 2024.
Comments: 14 pages, 5 figures
ACM Class: C.1.3; I.2.0

5. arXiv:2410.13419 [pdf, other] - cs.SD (Sound); cs.MM (Multimedia); eess.AS (Audio and Speech Processing)
Title: MeloTrans: A Text to Symbolic Music Generation Model Following Human Composition Habit
Authors: Yutian Wang, Wanyin Yang, Zhenrong Dai, Yilong Zhang, Kun Zhao, Hui Wang
Abstract: At present, neural network models show powerful sequence prediction ability and are used in many automatic composition models. By contrast, the way humans compose music is very different. Composers usually start by creating musical motifs and then develop them into music through a series of rules. This process ensures that the music has a specific structure and changing pattern. However, it is difficult for neural network models to learn these composition rules from training data, which results in a lack of musicality and diversity in the generated music. This paper posits that integrating the learning capabilities of neural networks with human-derived knowledge may lead to better results. To achieve this, we develop the POP909_M dataset, the first to include labels for musical motifs and their variants, providing a basis for mimicking human compositional habits. Building on this, we propose MeloTrans, a text-to-music composition model that employs principles of motif development rules. Our experiments demonstrate that MeloTrans excels beyond existing music generation models and even surpasses Large Language Models (LLMs) like ChatGPT-4. This highlights the importance of merging human insights with neural network capabilities to achieve superior symbolic music generation.
Submitted 17 October, 2024; originally announced October 2024.

6. arXiv:2410.04128 [pdf, other] - eess.IV (Image and Video Processing); cs.CV (Computer Vision and Pattern Recognition)
Title: Optimizing Medical Image Segmentation with Advanced Decoder Design
Authors: Weibin Yang, Zhiqi Dong, Mingyuan Xu, Longwei Xu, Dehua Geng, Yusong Li, Pengwei Wang
Abstract: U-Net is widely used in medical image segmentation due to its simple and flexible architecture design. To address the challenges of scale and complexity in medical tasks, several variants of U-Net have been proposed. In particular, methods based on Vision Transformer (ViT), represented by Swin UNETR, have gained widespread attention in recent years. However, these improvements often focus on the encoder, overlooking the crucial role of the decoder in optimizing segmentation details. This design imbalance limits the potential for further enhancing segmentation performance. To address this issue, we analyze the roles of various decoder components, including the upsampling method, skip connection, and feature extraction module, as well as the shortcomings of existing methods. Consequently, we propose Swin DER (i.e., Swin UNETR Decoder Enhanced and Refined) by specifically optimizing the design of these three components. Swin DER performs upsampling using a learnable interpolation algorithm called offset coordinate neighborhood weighted upsampling (Onsampling) and replaces the traditional skip connection with a spatial-channel parallel attention gate (SCP AG). Additionally, Swin DER introduces deformable convolution along with an attention mechanism in the feature extraction module of the decoder. Our model design achieves excellent results, surpassing other state-of-the-art methods on both the Synapse and the MSD brain tumor segmentation tasks. Code is available at: https://github.com/WillBeanYang/Swin-DER
Submitted 5 October, 2024; originally announced October 2024.

7. arXiv:2410.00796 [pdf, other] - eess.SY (Systems and Control); cs.LG (Machine Learning); math.OC (Optimization and Control)
Title: Fast and Reliable $N-k$ Contingency Screening with Input-Convex Neural Networks
Authors: Nicolas Christianson, Wenqi Cui, Steven Low, Weiwei Yang, Baosen Zhang
Abstract: Power system operators must ensure that dispatch decisions remain feasible in case of grid outages or contingencies to prevent cascading failures and ensure reliable operation. However, checking the feasibility of all $N - k$ contingencies -- every possible simultaneous failure of $k$ grid components -- is computationally intractable for even small $k$, requiring system operators to resort to heuristic screening methods. Because of the increase in uncertainty and changes in system behaviors, heuristic lists might not include all relevant contingencies, generating false negatives in which unsafe scenarios are misclassified as safe. In this work, we propose to use input-convex neural networks (ICNNs) for contingency screening. We show that ICNN reliability can be determined by solving a convex optimization problem, and by scaling model weights using this problem as a differentiable optimization layer during training, we can learn an ICNN classifier that is both data-driven and has provably guaranteed reliability. Namely, our method can ensure a zero false negative rate. We empirically validate this methodology in a case study on the IEEE 39-bus test network, observing that it yields substantial (10-20x) speedups while having excellent classification accuracy.
Submitted 1 October, 2024; originally announced October 2024.
Comments: 11 pages, 4 figures
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.09396v1-abstract-full').style.display = 'none'; document.getElementById('2409.09396v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages, 3 figures, submitted to ICASSP 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.09389">arXiv:2409.09389</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.09389">pdf</a>, <a href="https://arxiv.org/format/2409.09389">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Integrated Multi-Level Knowledge Distillation for Enhanced Speaker Verification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Yang%2C+W">Wenhao Yang</a>, <a href="/search/eess?searchtype=author&amp;query=Wei%2C+J">Jianguo Wei</a>, <a href="/search/eess?searchtype=author&amp;query=Lu%2C+W">Wenhuan Lu</a>, <a href="/search/eess?searchtype=author&amp;query=Lu%2C+X">Xugang Lu</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+L">Lei Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.09389v1-abstract-short" style="display: inline;"> Knowledge distillation (KD) is widely used in audio tasks, such as speaker verification (SV), by transferring knowledge from a well-trained large model (the teacher) to a smaller, more compact model (the student) for efficiency and portability. Existing KD methods for SV often mirror those used in image processing, focusing on approximating predicted probabilities and hidden representations. Howev&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.09389v1-abstract-full').style.display = 'inline'; document.getElementById('2409.09389v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.09389v1-abstract-full" style="display: none;"> Knowledge distillation (KD) is widely used in audio tasks, such as speaker verification (SV), by transferring knowledge from a well-trained large model (the teacher) to a smaller, more compact model (the student) for efficiency and portability. Existing KD methods for SV often mirror those used in image processing, focusing on approximating predicted probabilities and hidden representations. However, these methods fail to account for the multi-level temporal properties of speech audio. In this paper, we propose a novel KD method, i.e., Integrated Multi-level Knowledge Distillation (IML-KD), to transfer knowledge of various temporal-scale features of speech from a teacher model to a student model. 
In the IML-KD, temporal context information from the teacher model is integrated into novel Integrated Gradient-based input-sensitive representations from speech segments with various durations, and the student model is trained to infer these representations with multi-level alignment for the output. We conduct SV experiments on the VoxCeleb1 dataset to evaluate the proposed method. Experimental results demonstrate that IML-KD significantly enhances KD performance, reducing the Equal Error Rate (EER) by 5%. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.09389v1-abstract-full').style.display = 'none'; document.getElementById('2409.09389v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages, 3 figures, submitted to ICASSP 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.12126">arXiv:2408.12126</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.12126">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Robust Input Shaping Vibration Control via Extended Kalman Filter-Incorporated Residual Neural Network </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Yang%2C+W">Weiyi Yang</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+S">Shuai Li</a>, <a href="/search/eess?searchtype=author&amp;query=Luo%2C+X">Xin Luo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.12126v1-abstract-short" style="display: inline;"> With the rapid development of industry, the vibration control of flexible structures and underactuated systems has been increasingly gaining attention. Input shaping technology enables stable performance for high-speed motion in industrial motion systems. However, existing input shapers generally suffer from the ineffective control performance due to the neglect of observation errors. To address t&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.12126v1-abstract-full').style.display = 'inline'; document.getElementById('2408.12126v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.12126v1-abstract-full" style="display: none;"> With the rapid development of industry, the vibration control of flexible structures and underactuated systems has been increasingly gaining attention. Input shaping technology enables stable performance for high-speed motion in industrial motion systems. However, existing input shapers generally suffer from the ineffective control performance due to the neglect of observation errors. 
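Entry 9 above (arXiv:2409.09389) builds its teacher-side representations on Integrated Gradients. For background only, here is a plain Integrated Gradients attribution sketch (Sundararajan et al., 2017), not the paper's IML-KD procedure; the zero baseline, the 50-step Riemann approximation of the path integral, and the assumption that model maps a batch of inputs to scalar scores are illustrative choices.

import torch

def integrated_gradients(model, x, baseline=None, steps=50):
    # IG_i(x) = (x_i - x'_i) times the average, along the straight path from the
    # baseline x' to x, of the gradient of the model output with respect to input i.
    if baseline is None:
        baseline = torch.zeros_like(x)
    alphas = torch.linspace(0.0, 1.0, steps).view(-1, *([1] * x.dim()))
    path = baseline + alphas * (x - baseline)   # shape: (steps, *x.shape)
    path.requires_grad_(True)
    out = model(path).sum()                     # one score per path point, summed
    grads = torch.autograd.grad(out, path)[0]   # gradient at each path point
    return (x - baseline) * grads.mean(dim=0)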
To address this critical issue, this paper proposes an Extended Kalman Filter-incorporated Residual Neural Network-based input Shaping (ERS) model for vibration control. Its main ideas are two-fold: a) adopting an extended Kalman filter to address a vertical flexible beam&#39;s model errors; and b) adopting a residual neural network to cascade with the extended Kalman filter for eliminating the remaining observation errors. Detailed experiments on a real dataset collected from a vertical flexible beam demonstrate that the proposed ERS model has achieved significant vibration control performance over several state-of-the-art models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.12126v1-abstract-full').style.display = 'none'; document.getElementById('2408.12126v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.04927">arXiv:2408.04927</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.04927">pdf</a>, <a href="https://arxiv.org/format/2408.04927">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Large Models for Aerial Edges: An Edge-Cloud Model Evolution and Communication Paradigm </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+S">Shuhang Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+Q">Qingyu Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+K">Ke Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Di%2C+B">Boya Di</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+H">Hongliang Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Yang%2C+W">Wenhan Yang</a>, <a href="/search/eess?searchtype=author&amp;query=Niyato%2C+D">Dusit Niyato</a>, <a href="/search/eess?searchtype=author&amp;query=Han%2C+Z">Zhu Han</a>, <a href="/search/eess?searchtype=author&amp;query=Poor%2C+H+V">H. Vincent Poor</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.04927v1-abstract-short" style="display: inline;"> The future sixth-generation (6G) of wireless networks is expected to surpass its predecessors by offering ubiquitous coverage through integrated air-ground facility deployments in both communication and computing domains. 
In this network, aerial facilities, such as unmanned aerial vehicles (UAVs), conduct artificial intelligence (AI) computations based on multi-modal data to support diverse applic&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.04927v1-abstract-full').style.display = 'inline'; document.getElementById('2408.04927v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.04927v1-abstract-full" style="display: none;"> The future sixth-generation (6G) of wireless networks is expected to surpass its predecessors by offering ubiquitous coverage through integrated air-ground facility deployments in both communication and computing domains. In this network, aerial facilities, such as unmanned aerial vehicles (UAVs), conduct artificial intelligence (AI) computations based on multi-modal data to support diverse applications including surveillance and environment construction. However, these multi-domain inference and content generation tasks require large AI models, demanding powerful computing capabilities, thus posing significant challenges for UAVs. To tackle this problem, we propose an integrated edge-cloud model evolution framework, where UAVs serve as edge nodes for data collection and edge model computation. Through wireless channels, UAVs collaborate with ground cloud servers, providing cloud model computation and model updating for edge UAVs. With limited wireless communication bandwidth, the proposed framework faces the challenge of information exchange scheduling between the edge UAVs and the cloud server. To tackle this, we present joint task allocation, transmission resource allocation, transmission data quantization design, and edge model update design to enhance the inference accuracy of the integrated air-ground edge-cloud model evolution framework by mean average precision (mAP) maximization. A closed-form lower bound on the mAP of the proposed framework is derived, and the solution to the mAP maximization problem is optimized accordingly. Simulations, based on results from vision-based classification experiments, consistently demonstrate that the mAP of the proposed framework outperforms both a centralized cloud model framework and a distributed edge model framework across various communication bandwidths and data sizes. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.04927v1-abstract-full').style.display = 'none'; document.getElementById('2408.04927v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.20585">arXiv:2407.20585</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.20585">pdf</a>, <a href="https://arxiv.org/format/2407.20585">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> A UAV-Enabled Time-Sensitive Data Collection Scheme for Grassland Monitoring Edge Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Jiao%2C+D">Dongbin Jiao</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+Z">Zihao Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Fan%2C+W">Wen Fan</a>, <a href="/search/eess?searchtype=author&amp;query=Yang%2C+W">Weibo Yang</a>, <a href="/search/eess?searchtype=author&amp;query=Yang%2C+P">Peng Yang</a>, <a href="/search/eess?searchtype=author&amp;query=Shang%2C+Z">Zhanhuan Shang</a>, <a href="/search/eess?searchtype=author&amp;query=Yan%2C+S">Shi Yan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.20585v2-abstract-short" style="display: inline;"> Grassland monitoring is essential for the sustainable development of grassland resources. Traditional Internet of Things (IoT) devices generate critical ecological data, making data loss unacceptable, but the harsh environment complicates data collection. Unmanned Aerial Vehicle (UAV) and mobile edge computing (MEC) offer efficient data collection solutions, enhancing performance on resource-limit&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.20585v2-abstract-full').style.display = 'inline'; document.getElementById('2407.20585v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.20585v2-abstract-full" style="display: none;"> Grassland monitoring is essential for the sustainable development of grassland resources. Traditional Internet of Things (IoT) devices generate critical ecological data, making data loss unacceptable, but the harsh environment complicates data collection. Unmanned Aerial Vehicle (UAV) and mobile edge computing (MEC) offer efficient data collection solutions, enhancing performance on resource-limited mobile devices. In this context, this paper is the first to investigate a UAV-enabled time-sensitive data collection problem (TSDCMP) within grassland monitoring edge networks (GMENs). Unlike many existing data collection scenarios, this problem has three key challenges. First, the total amount of data collected depends significantly on the data collection duration and arrival time of UAV at each access point (AP). Second, the volume of data at different APs varies among regions due to differences in monitoring objects and vegetation coverage. Third, the service requests time and locations from APs are often not adjacent topologically. To address these issues, We formulate the TSDCMP for UAV-enabled GMENs as a mixed-integer programming model in a single trip. 
This model considers constraints such as the limited energy of UAV, the coupled routing and time scheduling, and the state of APs and UAV arrival time. Subsequently, we propose a novel cooperative heuristic algorithm based on temporal-spatial correlations (CHTSC) that integrates a modified dynamic programming (MDP) into an iterated local search to solve the TSDCMP for UAV-enabled GMENs. This approach fully takes into account the temporal and spatial relationships between consecutive service requests from APs. Systematic simulation studies demonstrate that the mixed-integer programming model effectively represents the TSDCMP within UAV-enabled GMENs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.20585v2-abstract-full').style.display = 'none'; document.getElementById('2407.20585v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 30 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.15395">arXiv:2407.15395</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.15395">pdf</a>, <a href="https://arxiv.org/format/2407.15395">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> FAST-GSC: Fast and Adaptive Semantic Transmission for Generative Semantic Communication </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Wang%2C+Y">Yiru Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Yang%2C+W">Wanting Yang</a>, <a href="/search/eess?searchtype=author&amp;query=Xiong%2C+Z">Zehui Xiong</a>, <a href="/search/eess?searchtype=author&amp;query=Zhao%2C+Y">Yuping Zhao</a>, <a href="/search/eess?searchtype=author&amp;query=Mao%2C+S">Shiwen Mao</a>, <a href="/search/eess?searchtype=author&amp;query=Quek%2C+T+Q+S">Tony Q. S. Quek</a>, <a href="/search/eess?searchtype=author&amp;query=Poor%2C+H+V">H. Vincent Poor</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.15395v1-abstract-short" style="display: inline;"> The rapidly evolving field of generative artificial intelligence technology has introduced innovative approaches for developing semantic communication (SemCom) frameworks, leading to the emergence of a new paradigm-generative SemCom (GSC). However, the complex processes involved in semantic extraction and generative inference may result in considerable latency in resource-constrained scenarios. 
To&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.15395v1-abstract-full').style.display = 'inline'; document.getElementById('2407.15395v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.15395v1-abstract-full" style="display: none;"> The rapidly evolving field of generative artificial intelligence technology has introduced innovative approaches for developing semantic communication (SemCom) frameworks, leading to the emergence of a new paradigm-generative SemCom (GSC). However, the complex processes involved in semantic extraction and generative inference may result in considerable latency in resource-constrained scenarios. To tackle these issues, we introduce a new GSC framework that involves fast and adaptive semantic transmission (FAST-GSC). This framework incorporates one innovative communication mechanism and two enhancement strategies at the transmitter and receiver, respectively. Aiming to reduce task latency, our communication mechanism enables fast semantic transmission by parallelizing the processes of semantic extraction at the transmitter and inference at the receiver. Preliminary evaluations indicate that while this mechanism effectively reduces task latency, it could potentially compromise task performance. To address this issue, we propose two additional methods for enhancement. First, at the transmitter, we employ reinforcement learning to discern the intrinsic temporal dependencies among the semantic units and design their extraction and transmission sequence accordingly. Second, at the receiver, we design a semantic difference calculation module and propose a sequential conditional denoising approach to alleviate the stringent immediacy requirement for the reception of semantic features. Extensive experiments demonstrate that our proposed architecture achieves a performance score comparable to the conventional GSC architecture while realizing a 52% reduction in residual task latency that extends beyond the fixed inference duration. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.15395v1-abstract-full').style.display = 'none'; document.getElementById('2407.15395v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. 
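<p class="is-size-7">A rough, non-authoritative sketch of the parallelized transmit-and-infer idea described above: a producer streams semantic units into a channel while the receiver consumes and runs inference concurrently, so inference overlaps with ongoing extraction. The extract/infer stand-ins and their timings are invented for illustration.</p> <pre><code class="language-python">
# Hedged toy sketch of the pipelining idea only: the receiver starts inference on
# each semantic unit as soon as it arrives, instead of waiting for all units.
# extract() and infer() are stand-ins, not the paper's actual modules.
import queue, threading, time

def extract(unit_id):
    time.sleep(0.05)              # pretend semantic-extraction cost
    return f"semantics-{unit_id}"

def infer(semantics):
    time.sleep(0.08)              # pretend generative-inference cost
    return f"reconstruction of {semantics}"

def transmitter(n_units, channel):
    for i in range(n_units):
        channel.put(extract(i))   # send each unit as soon as it is ready
    channel.put(None)             # end-of-stream marker

channel = queue.Queue()
threading.Thread(target=transmitter, args=(4, channel), daemon=True).start()
while True:
    item = channel.get()
    if item is None:
        break
    print(infer(item))            # inference overlaps with ongoing extraction
</code></pre>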
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.09562">arXiv:2407.09562</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.09562">pdf</a>, <a href="https://arxiv.org/format/2407.09562">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1016/j.compag.2024.109432">10.1016/j.compag.2024.109432 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Edge AI-Enabled Chicken Health Detection Based on Enhanced FCOS-Lite and Knowledge Distillation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Tong%2C+Q">Qiang Tong</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+J">Jinrui Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Yang%2C+W">Wenshuang Yang</a>, <a href="/search/eess?searchtype=author&amp;query=Wu%2C+S">Songtao Wu</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+W">Wenqi Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Sun%2C+C">Chen Sun</a>, <a href="/search/eess?searchtype=author&amp;query=Xu%2C+K">Kuanhong Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.09562v3-abstract-short" style="display: inline;"> The utilization of AIoT technology has become a crucial trend in modern poultry management, offering the potential to optimize farming operations and reduce human workloads. This paper presents a real-time and compact edge-AI enabled detector designed to identify chickens and their healthy statuses using frames captured by a lightweight and intelligent camera equipped with an edge-AI enabled CMOS&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.09562v3-abstract-full').style.display = 'inline'; document.getElementById('2407.09562v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.09562v3-abstract-full" style="display: none;"> The utilization of AIoT technology has become a crucial trend in modern poultry management, offering the potential to optimize farming operations and reduce human workloads. This paper presents a real-time and compact edge-AI enabled detector designed to identify chickens and their healthy statuses using frames captured by a lightweight and intelligent camera equipped with an edge-AI enabled CMOS sensor. To ensure efficient deployment of the proposed compact detector within the memory-constrained edge-AI enabled CMOS sensor, we employ a FCOS-Lite detector leveraging MobileNet as the backbone. To mitigate the issue of reduced accuracy in compact edge-AI detectors without incurring additional inference costs, we propose a gradient weighting loss function as classification loss and introduce CIOU loss function as localization loss. 
Additionally, we propose a knowledge distillation scheme to transfer valuable information from a large teacher detector to the proposed FCOS-Lite detector, thereby enhancing its performance while preserving a compact model size. Experimental results demonstrate the proposed edge-AI enabled detector achieves commendable performance metrics, including a mean average precision (mAP) of 95.1$\%$ and an F1-score of 94.2$\%$, etc. Notably, the proposed detector can be efficiently deployed and operates at a speed exceeding 20 FPS on the edge-AI enabled CMOS sensor, achieved through int8 quantization. That meets practical demands for automated poultry health monitoring using lightweight intelligent cameras with low power consumption and minimal bandwidth costs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.09562v3-abstract-full').style.display = 'none'; document.getElementById('2407.09562v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 3 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.15846">arXiv:2406.15846</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.15846">pdf</a>, <a href="https://arxiv.org/format/2406.15846">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Revisiting Interpolation Augmentation for Speech-to-Text Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Xu%2C+C">Chen Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+J">Jie Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+X">Xiaoqian Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Dong%2C+Q">Qianqian Dong</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+C">Chunliang Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Xiao%2C+T">Tong Xiao</a>, <a href="/search/eess?searchtype=author&amp;query=Zhu%2C+J">Jingbo Zhu</a>, <a href="/search/eess?searchtype=author&amp;query=Man%2C+D">Dapeng Man</a>, <a href="/search/eess?searchtype=author&amp;query=Yang%2C+W">Wu Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.15846v1-abstract-short" style="display: inline;"> Speech-to-text (S2T) generation systems frequently face challenges in low-resource scenarios, primarily due to the lack of extensive labeled datasets. One emerging solution is constructing virtual training samples by interpolating inputs and labels, which has notably enhanced system generalization in other domains. 
Despite its potential, this technique&#39;s application in S2T tasks has remained under&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.15846v1-abstract-full').style.display = 'inline'; document.getElementById('2406.15846v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.15846v1-abstract-full" style="display: none;"> Speech-to-text (S2T) generation systems frequently face challenges in low-resource scenarios, primarily due to the lack of extensive labeled datasets. One emerging solution is constructing virtual training samples by interpolating inputs and labels, which has notably enhanced system generalization in other domains. Despite its potential, this technique&#39;s application in S2T tasks has remained under-explored. In this paper, we delve into the utility of interpolation augmentation, guided by several pivotal questions. Our findings reveal that employing an appropriate strategy in interpolation augmentation significantly enhances performance across diverse tasks, architectures, and data scales, offering a promising avenue for more robust S2T systems in resource-constrained settings. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.15846v1-abstract-full').style.display = 'none'; document.getElementById('2406.15846v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ACL 2024 Findings</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.14052">arXiv:2406.14052</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.14052">pdf</a>, <a href="https://arxiv.org/format/2406.14052">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Perspective+ Unet: Enhancing Segmentation with Bi-Path Fusion and Efficient Non-Local Attention for Superior Receptive Fields </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Hu%2C+J">Jintong Hu</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+S">Siyan Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Pan%2C+Z">Zhiyi Pan</a>, <a href="/search/eess?searchtype=author&amp;query=Zeng%2C+S">Sen Zeng</a>, <a href="/search/eess?searchtype=author&amp;query=Yang%2C+W">Wenming Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.14052v1-abstract-short" style="display: inline;"> Precise segmentation of medical images is fundamental for extracting critical clinical information, which plays a pivotal role in enhancing the accuracy of diagnoses, formulating effective treatment plans, and improving patient outcomes. 
Although Convolutional Neural Networks (CNNs) and non-local attention methods have achieved notable success in medical image segmentation, they either struggle to&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.14052v1-abstract-full').style.display = 'inline'; document.getElementById('2406.14052v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.14052v1-abstract-full" style="display: none;"> Precise segmentation of medical images is fundamental for extracting critical clinical information, which plays a pivotal role in enhancing the accuracy of diagnoses, formulating effective treatment plans, and improving patient outcomes. Although Convolutional Neural Networks (CNNs) and non-local attention methods have achieved notable success in medical image segmentation, they either struggle to capture long-range spatial dependencies due to their reliance on local features, or face significant computational and feature integration challenges when attempting to address this issue with global attention mechanisms. To overcome existing limitations in medical image segmentation, we propose a novel architecture, Perspective+ Unet. This framework is characterized by three major innovations: (i) It introduces a dual-pathway strategy at the encoder stage that combines the outcomes of traditional and dilated convolutions. This not only maintains the local receptive field but also significantly expands it, enabling better comprehension of the global structure of images while retaining detail sensitivity. (ii) The framework incorporates an efficient non-local transformer block, named ENLTB, which utilizes kernel function approximation for effective long-range dependency capture with linear computational and spatial complexity. (iii) A Spatial Cross-Scale Integrator strategy is employed to merge global dependencies and local contextual cues across model stages, meticulously refining features from various levels to harmonize global and local information. Experimental results on the ACDC and Synapse datasets demonstrate the effectiveness of our proposed Perspective+ Unet. The code is available in the supplementary material. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.14052v1-abstract-full').style.display = 'none'; document.getElementById('2406.14052v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. 
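<p class="is-size-7">A minimal sketch, assuming PyTorch, of what the dual-pathway encoder idea in item (i) above could look like: a standard 3x3 convolution and a dilated 3x3 convolution whose outputs are fused. Channel sizes, the dilation rate, and the fusion rule are illustrative guesses, not the authors' exact design.</p> <pre><code class="language-python">
# Hedged sketch of a bi-path block: one local path, one dilated wide path, fused 1x1.
import torch
import torch.nn as nn

class BiPathBlock(nn.Module):
    def __init__(self, channels: int, dilation: int = 3):
        super().__init__()
        self.local_path = nn.Conv2d(channels, channels, 3, padding=1)
        self.dilated_path = nn.Conv2d(channels, channels, 3,
                                      padding=dilation, dilation=dilation)
        self.fuse = nn.Conv2d(2 * channels, channels, 1)   # merge both receptive fields
        self.act = nn.ReLU(inplace=True)

    def forward(self, x):
        local = self.local_path(x)           # keeps the local receptive field
        wide = self.dilated_path(x)          # expands the receptive field
        return self.act(self.fuse(torch.cat([local, wide], dim=1)))

x = torch.randn(1, 32, 64, 64)
print(BiPathBlock(32)(x).shape)              # torch.Size([1, 32, 64, 64])
</code></pre>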
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">13 pages, 5 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.10956">arXiv:2406.10956</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.10956">pdf</a>, <a href="https://arxiv.org/format/2406.10956">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Robust Channel Learning for Large-Scale Radio Speaker Verification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Yang%2C+W">Wenhao Yang</a>, <a href="/search/eess?searchtype=author&amp;query=Wei%2C+J">Jianguo Wei</a>, <a href="/search/eess?searchtype=author&amp;query=Lu%2C+W">Wenhuan Lu</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+L">Lei Li</a>, <a href="/search/eess?searchtype=author&amp;query=Lu%2C+X">Xugang Lu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.10956v1-abstract-short" style="display: inline;"> Recent research in speaker verification has increasingly focused on achieving robust and reliable recognition under challenging channel conditions and noisy environments. Identifying speakers in radio communications is particularly difficult due to inherent limitations such as constrained bandwidth and pervasive noise interference. To address this issue, we present a Channel Robust Speaker Learnin&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.10956v1-abstract-full').style.display = 'inline'; document.getElementById('2406.10956v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.10956v1-abstract-full" style="display: none;"> Recent research in speaker verification has increasingly focused on achieving robust and reliable recognition under challenging channel conditions and noisy environments. Identifying speakers in radio communications is particularly difficult due to inherent limitations such as constrained bandwidth and pervasive noise interference. To address this issue, we present a Channel Robust Speaker Learning (CRSL) framework that enhances the robustness of the current speaker verification pipeline, considering data source, data augmentation, and the efficiency of model transfer processes. Our framework introduces an augmentation module that mitigates bandwidth variations in radio speech datasets by manipulating the bandwidth of training inputs. It also addresses unknown noise by introducing noise within the manifold space. Additionally, we propose an efficient fine-tuning method that reduces the need for extensive additional training time and large amounts of data. Moreover, we develop a toolkit for assembling a large-scale radio speech corpus and establish a benchmark specifically tailored for radio scenario speaker verification studies. 
Experimental results demonstrate that our proposed methodology effectively enhances performance and mitigates degradation caused by radio transmission in speaker verification tasks. The code will be available on Github. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.10956v1-abstract-full').style.display = 'none'; document.getElementById('2406.10956v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">12 pages, 11 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.10236">arXiv:2406.10236</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.10236">pdf</a>, <a href="https://arxiv.org/format/2406.10236">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Lightening Anything in Medical Images </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Fei%2C+B">Ben Fei</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+Y">Yixuan Li</a>, <a href="/search/eess?searchtype=author&amp;query=Yang%2C+W">Weidong Yang</a>, <a href="/search/eess?searchtype=author&amp;query=Gao%2C+H">Hengjun Gao</a>, <a href="/search/eess?searchtype=author&amp;query=Xu%2C+J">Jingyi Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Ma%2C+L">Lipeng Ma</a>, <a href="/search/eess?searchtype=author&amp;query=Yang%2C+Y">Yatian Yang</a>, <a href="/search/eess?searchtype=author&amp;query=Zhou%2C+P">Pinghong Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.10236v1-abstract-short" style="display: inline;"> The development of medical imaging techniques has made a significant contribution to clinical decision-making. However, the existence of suboptimal imaging quality, as indicated by irregular illumination or imbalanced intensity, presents significant obstacles in automating disease screening, analysis, and diagnosis. Existing approaches for natural image enhancement are mostly trained with numerous&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.10236v1-abstract-full').style.display = 'inline'; document.getElementById('2406.10236v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.10236v1-abstract-full" style="display: none;"> The development of medical imaging techniques has made a significant contribution to clinical decision-making. However, the existence of suboptimal imaging quality, as indicated by irregular illumination or imbalanced intensity, presents significant obstacles in automating disease screening, analysis, and diagnosis. 
Existing approaches for natural image enhancement are mostly trained with numerous paired images, presenting challenges in data collection and training costs, all while lacking the ability to generalize effectively. Here, we introduce a pioneering training-free Diffusion Model for Universal Medical Image Enhancement, named UniMIE. UniMIE demonstrates its unsupervised enhancement capabilities across various medical image modalities without the need for any fine-tuning. It accomplishes this by relying solely on a single pre-trained model from ImageNet. We conduct a comprehensive evaluation on 13 imaging modalities and over 15 medical types, demonstrating better qualities, robustness, and accuracy than other modality-specific and data-inefficient models. By delivering high-quality enhancement and corresponding accuracy downstream tasks across a wide range of tasks, UniMIE exhibits considerable potential to accelerate the advancement of diagnostic tools and customized treatment plans. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.10236v1-abstract-full').style.display = 'none'; document.getElementById('2406.10236v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">23 pages, 6 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.18790">arXiv:2405.18790</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2405.18790">pdf</a>, <a href="https://arxiv.org/format/2405.18790">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Opinion-Unaware Blind Image Quality Assessment using Multi-Scale Deep Feature Statistics </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Ni%2C+Z">Zhangkai Ni</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+Y">Yue Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Ding%2C+K">Keyan Ding</a>, <a href="/search/eess?searchtype=author&amp;query=Yang%2C+W">Wenhan Yang</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+H">Hanli Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+S">Shiqi Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.18790v1-abstract-short" style="display: inline;"> Deep learning-based methods have significantly influenced the blind image quality assessment (BIQA) field, however, these methods often require training using large amounts of human rating data. 
In contrast, traditional knowledge-based methods are cost-effective for training but face challenges in effectively extracting features aligned with human visual perception. To bridge these gaps, we propos&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.18790v1-abstract-full').style.display = 'inline'; document.getElementById('2405.18790v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.18790v1-abstract-full" style="display: none;"> Deep learning-based methods have significantly influenced the blind image quality assessment (BIQA) field, however, these methods often require training using large amounts of human rating data. In contrast, traditional knowledge-based methods are cost-effective for training but face challenges in effectively extracting features aligned with human visual perception. To bridge these gaps, we propose integrating deep features from pre-trained visual models with a statistical analysis model into a Multi-scale Deep Feature Statistics (MDFS) model for achieving opinion-unaware BIQA (OU-BIQA), thereby eliminating the reliance on human rating data and significantly improving training efficiency. Specifically, we extract patch-wise multi-scale features from pre-trained vision models, which are subsequently fitted into a multivariate Gaussian (MVG) model. The final quality score is determined by quantifying the distance between the MVG model derived from the test image and the benchmark MVG model derived from the high-quality image set. A comprehensive series of experiments conducted on various datasets show that our proposed model exhibits superior consistency with human visual perception compared to state-of-the-art BIQA models. Furthermore, it shows improved generalizability across diverse target-specific BIQA tasks. Our code is available at: https://github.com/eezkni/MDFS <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.18790v1-abstract-full').style.display = 'none'; document.getElementById('2405.18790v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. 
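<p class="is-size-7">A hedged numpy sketch of the scoring step described above: fit a multivariate Gaussian to patch-wise deep features of the test image and measure its distance to a benchmark Gaussian fitted on high-quality images. The specific NIQE-style distance used below is one common choice and may differ from the paper's formulation; the toy feature arrays are stand-ins.</p> <pre><code class="language-python">
# Illustrative MVG-statistics scoring: larger distance to the pristine benchmark
# Gaussian is read as lower predicted quality. Not the paper's exact metric.
import numpy as np

def fit_mvg(features: np.ndarray):
    """features: (n_patches, dim) array of deep features."""
    mu = features.mean(axis=0)
    cov = np.cov(features, rowvar=False)
    return mu, cov

def mvg_distance(mu1, cov1, mu2, cov2):
    diff = mu1 - mu2
    pooled = np.linalg.pinv((cov1 + cov2) / 2.0)
    return float(np.sqrt(diff @ pooled @ diff))

rng = np.random.default_rng(0)
bench = fit_mvg(rng.normal(size=(500, 64)))       # stand-in for pristine-image features
test = fit_mvg(rng.normal(loc=0.3, size=(200, 64)))
print(mvg_distance(*test, *bench))                # larger distance, lower predicted quality
</code></pre>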
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to IEEE Transactions on Multimedia 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.12357">arXiv:2405.12357</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2405.12357">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Paired Conditional Generative Adversarial Network for Highly Accelerated Liver 4D MRI </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Xu%2C+D">Di Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Miao%2C+X">Xin Miao</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+H">Hengjie Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Scholey%2C+J+E">Jessica E. Scholey</a>, <a href="/search/eess?searchtype=author&amp;query=Yang%2C+W">Wensha Yang</a>, <a href="/search/eess?searchtype=author&amp;query=Feng%2C+M">Mary Feng</a>, <a href="/search/eess?searchtype=author&amp;query=Ohliger%2C+M">Michael Ohliger</a>, <a href="/search/eess?searchtype=author&amp;query=Lin%2C+H">Hui Lin</a>, <a href="/search/eess?searchtype=author&amp;query=Lao%2C+Y">Yi Lao</a>, <a href="/search/eess?searchtype=author&amp;query=Yang%2C+Y">Yang Yang</a>, <a href="/search/eess?searchtype=author&amp;query=Sheng%2C+K">Ke Sheng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.12357v1-abstract-short" style="display: inline;"> Purpose: 4D MRI with high spatiotemporal resolution is desired for image-guided liver radiotherapy. Acquiring densely sampling k-space data is time-consuming. Accelerated acquisition with sparse samples is desirable but often causes degraded image quality or long reconstruction time. We propose the Reconstruct Paired Conditional Generative Adversarial Network (Re-Con-GAN) to shorten the 4D MRI rec&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.12357v1-abstract-full').style.display = 'inline'; document.getElementById('2405.12357v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.12357v1-abstract-full" style="display: none;"> Purpose: 4D MRI with high spatiotemporal resolution is desired for image-guided liver radiotherapy. Acquiring densely sampling k-space data is time-consuming. Accelerated acquisition with sparse samples is desirable but often causes degraded image quality or long reconstruction time. We propose the Reconstruct Paired Conditional Generative Adversarial Network (Re-Con-GAN) to shorten the 4D MRI reconstruction time while maintaining the reconstruction quality. Methods: Patients who underwent free-breathing liver 4D MRI were included in the study. Fully- and retrospectively under-sampled data at 3, 6 and 10 times (3x, 6x and 10x) were first reconstructed using the nuFFT algorithm. Re-Con-GAN then trained input and output in pairs. 
Three types of networks, ResNet9, UNet and reconstruction swin transformer, were explored as generators. PatchGAN was selected as the discriminator. Re-Con-GAN processed the data (3D+t) as temporal slices (2D+t). A total of 48 patients with 12332 temporal slices were split into training (37 patients with 10721 slices) and test (11 patients with 1611 slices). Results: Re-Con-GAN consistently achieved comparable/better PSNR, SSIM, and RMSE scores compared to CS/UNet models. The inference times of Re-Con-GAN, UNet and CS are 0.15s, 0.16s, and 120s. The GTV detection task showed that Re-Con-GAN and CS, compared to UNet, better improved the dice score (3x Re-Con-GAN 80.98%; 3x CS 80.74%; 3x UNet 79.88%) of unprocessed under-sampled images (3x 69.61%). Conclusion: A generative network with adversarial training is proposed with promising and efficient reconstruction results demonstrated on an in-house dataset. The rapid and qualitative reconstruction of 4D liver MR has the potential to facilitate online adaptive MR-guided radiotherapy for liver cancer. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.12357v1-abstract-full').style.display = 'none'; document.getElementById('2405.12357v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.00822">arXiv:2405.00822</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2405.00822">pdf</a>, <a href="https://arxiv.org/ps/2405.00822">ps</a>, <a href="https://arxiv.org/format/2405.00822">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Kernel-based Learning for Safe Control of Discrete-Time Unknown Systems under Incomplete Observations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Yang%2C+Z">Zewen Yang</a>, <a href="/search/eess?searchtype=author&amp;query=Dai%2C+X">Xiaobing Dai</a>, <a href="/search/eess?searchtype=author&amp;query=Yang%2C+W">Weijie Yang</a>, <a href="/search/eess?searchtype=author&amp;query=%C4%B0lgen%2C+B">Bahar İlgen</a>, <a href="/search/eess?searchtype=author&amp;query=An%C5%BEel%2C+A">Aleksandar Anžel</a>, <a href="/search/eess?searchtype=author&amp;query=Hattab%2C+G">Georges Hattab</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.00822v1-abstract-short" style="display: inline;"> Safe control for dynamical systems is critical, yet the presence of unknown dynamics poses significant challenges. In this paper, we present a learning-based control approach for tracking control of a class of high-order systems, operating under the constraint of partially observable states.
The uncertainties inherent within the systems are modeled by kernel ridge regression, leveraging the propos&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.00822v1-abstract-full').style.display = 'inline'; document.getElementById('2405.00822v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.00822v1-abstract-full" style="display: none;"> Safe control for dynamical systems is critical, yet the presence of unknown dynamics poses significant challenges. In this paper, we present a learning-based control approach for tracking control of a class of high-order systems, operating under the constraint of partially observable states. The uncertainties inherent within the systems are modeled by kernel ridge regression, leveraging the proposed strategic data acquisition approach with limited state measurements. To achieve accurate trajectory tracking, a state observer that seamlessly integrates with the control law is devised. The analysis of the guaranteed control performance is conducted using Lyapunov theory due to the deterministic prediction error bound of kernel ridge regression, ensuring the adaptability of the approach in safety-critical scenarios. To demonstrate the effectiveness of our proposed approach, numerical simulations are performed, underscoring its contributions to the advancement of control strategies. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.00822v1-abstract-full').style.display = 'none'; document.getElementById('2405.00822v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.06765">arXiv:2404.06765</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2404.06765">pdf</a>, <a href="https://arxiv.org/format/2404.06765">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Harnessing the Power of AI-Generated Content for Semantic Communication </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Wang%2C+Y">Yiru Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Yang%2C+W">Wanting Yang</a>, <a href="/search/eess?searchtype=author&amp;query=Xiong%2C+Z">Zehui Xiong</a>, <a href="/search/eess?searchtype=author&amp;query=Zhao%2C+Y">Yuping Zhao</a>, <a href="/search/eess?searchtype=author&amp;query=Quek%2C+T+Q+S">Tony Q. S. Quek</a>, <a href="/search/eess?searchtype=author&amp;query=Han%2C+Z">Zhu Han</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.06765v1-abstract-short" style="display: inline;"> Semantic Communication (SemCom) is envisaged as the next-generation paradigm to address challenges stemming from the conflicts between the increasing volume of transmission data and the scarcity of spectrum resources. 
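<p class="is-size-7">Kernel ridge regression, named in the abstract above, is a standard tool; the following minimal sketch shows how it can fit an unknown residual from a handful of measurements. The RBF kernel, regularizer, and toy data are assumptions for illustration, not the paper's setup.</p> <pre><code class="language-python">
# Minimal kernel ridge regression sketch: alpha = (K + lam*I)^-1 y, prediction k(x)^T alpha.
import numpy as np

def rbf_kernel(A, B, lengthscale=0.5):
    d2 = ((A[:, None, :] - B[None, :, :]) ** 2).sum(-1)
    return np.exp(-d2 / (2 * lengthscale ** 2))

def krr_fit(X, y, lam=1e-3):
    K = rbf_kernel(X, X)
    return np.linalg.solve(K + lam * np.eye(len(X)), y)    # dual weights alpha

def krr_predict(X_train, alpha, X_query):
    return rbf_kernel(X_query, X_train) @ alpha

# Toy use: learn an unknown scalar residual from a few measurements.
X = np.linspace(0, 1, 20).reshape(-1, 1)
y = np.sin(4 * X).ravel()
alpha = krr_fit(X, y)
print(krr_predict(X, alpha, np.array([[0.37]])))
</code></pre>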
However, existing SemCom systems face drawbacks, such as low explainability, modality rigidity, and inadequate reconstruction functionality. Recognizing the transformative capabiliti&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.06765v1-abstract-full').style.display = 'inline'; document.getElementById('2404.06765v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.06765v1-abstract-full" style="display: none;"> Semantic Communication (SemCom) is envisaged as the next-generation paradigm to address challenges stemming from the conflicts between the increasing volume of transmission data and the scarcity of spectrum resources. However, existing SemCom systems face drawbacks, such as low explainability, modality rigidity, and inadequate reconstruction functionality. Recognizing the transformative capabilities of AI-generated content (AIGC) technologies in content generation, this paper explores a pioneering approach by integrating them into SemCom to address the aforementioned challenges. We employ a three-layer model to illustrate the proposed AIGC-assisted SemCom (AIGC-SCM) architecture, emphasizing its clear deviation from existing SemCom. Grounded in this model, we investigate various AIGC technologies with the potential to augment SemCom&#39;s performance. In alignment with SemCom&#39;s goal of conveying semantic meanings, we also introduce the new evaluation methods for our AIGC-SCM system. Subsequently, we explore communication scenarios where our proposed AIGC-SCM can realize its potential. For practical implementation, we construct a detailed integration workflow and conduct a case study in a virtual reality image transmission scenario. The results demonstrate our ability to maintain a high degree of alignment between the reconstructed content and the original source information, while substantially minimizing the data volume required for transmission. These findings pave the way for further enhancements in communication efficiency and the improvement of Quality of Service. At last, we present future directions for AIGC-SCM studies. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.06765v1-abstract-full').style.display = 'none'; document.getElementById('2404.06765v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.04916">arXiv:2404.04916</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2404.04916">pdf</a>, <a href="https://arxiv.org/format/2404.04916">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Correcting Diffusion-Based Perceptual Image Compression with Privileged End-to-End Decoder </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Ma%2C+Y">Yiyang Ma</a>, <a href="/search/eess?searchtype=author&amp;query=Yang%2C+W">Wenhan Yang</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+J">Jiaying Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.04916v2-abstract-short" style="display: inline;"> The images produced by diffusion models can attain excellent perceptual quality. However, it is challenging for diffusion models to guarantee distortion, hence the integration of diffusion models and image compression models still needs more comprehensive explorations. This paper presents a diffusion-based image compression method that employs a privileged end-to-end decoder model as correction, w&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.04916v2-abstract-full').style.display = 'inline'; document.getElementById('2404.04916v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.04916v2-abstract-full" style="display: none;"> The images produced by diffusion models can attain excellent perceptual quality. However, it is challenging for diffusion models to guarantee distortion, hence the integration of diffusion models and image compression models still needs more comprehensive explorations. This paper presents a diffusion-based image compression method that employs a privileged end-to-end decoder model as correction, which achieves better perceptual quality while guaranteeing the distortion to an extent. We build a diffusion model and design a novel paradigm that combines the diffusion model and an end-to-end decoder, and the latter is responsible for transmitting the privileged information extracted at the encoder side. Specifically, we theoretically analyze the reconstruction process of the diffusion models at the encoder side with the original images being visible. Based on the analysis, we introduce an end-to-end convolutional decoder to provide a better approximation of the score function $\nabla_{\mathbf{x}_t}\log p(\mathbf{x}_t)$ at the encoder side and effectively transmit the combination. Experiments demonstrate the superiority of our method in both distortion and perception compared with previous perceptual compression methods. 
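<p class="is-size-7">Background for the score term above: in DDPM-style diffusion models the score of the noisy marginal is tied to the learned noise predictor by the standard identity below (general background, not a claim about this paper's decoder-based approximation).</p> <p class="mathjax">$$ \nabla_{\mathbf{x}_t}\log p(\mathbf{x}_t) \approx -\frac{\epsilon_\theta(\mathbf{x}_t, t)}{\sqrt{1-\bar{\alpha}_t}}, \qquad \mathbf{x}_t = \sqrt{\bar{\alpha}_t}\,\mathbf{x}_0 + \sqrt{1-\bar{\alpha}_t}\,\boldsymbol{\epsilon}. $$</p>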
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.04916v2-abstract-full').style.display = 'none'; document.getElementById('2404.04916v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 7 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ICML 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.16797">arXiv:2403.16797</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2403.16797">pdf</a>, <a href="https://arxiv.org/format/2403.16797">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Privacy Preservation by Intermittent Transmission in Cooperative LQG Control Systems </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Lin%2C+W">Wenhao Lin</a>, <a href="/search/eess?searchtype=author&amp;query=Ni%2C+Y">Yuqing Ni</a>, <a href="/search/eess?searchtype=author&amp;query=Yang%2C+W">Wen Yang</a>, <a href="/search/eess?searchtype=author&amp;query=Yang%2C+C">Chao Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.16797v2-abstract-short" style="display: inline;"> In this paper, we study a cooperative linear quadratic Gaussian (LQG) control system with a single user and a server. In this system, the user runs a process and employs the server to meet the needs of computation. However, the user regards its state trajectories as privacy. Therefore, we propose a privacy scheme, in which the user sends data to the server intermittently. By this scheme, the serve&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.16797v2-abstract-full').style.display = 'inline'; document.getElementById('2403.16797v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.16797v2-abstract-full" style="display: none;"> In this paper, we study a cooperative linear quadratic Gaussian (LQG) control system with a single user and a server. In this system, the user runs a process and employs the server to meet the needs of computation. However, the user regards its state trajectories as privacy. Therefore, we propose a privacy scheme, in which the user sends data to the server intermittently. By this scheme, the server&#39;s received information of the user is reduced, and consequently the user&#39;s privacy is preserved. In this paper, we consider a periodic transmission scheme. We analyze the performance of privacy preservation and LQG control of different transmission periods. Under the given threshold of the control performance loss, a trade-off optimization problem is proposed. Finally, we give the solution to the optimization problem. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.16797v2-abstract-full').style.display = 'none'; document.getElementById('2403.16797v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 25 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.16384">arXiv:2403.16384</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2403.16384">pdf</a>, <a href="https://arxiv.org/format/2403.16384">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/ICASSP48485.2024.10447712">10.1109/ICASSP48485.2024.10447712 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Residual Dense Swin Transformer for Continuous Depth-Independent Ultrasound Imaging </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Hu%2C+J">Jintong Hu</a>, <a href="/search/eess?searchtype=author&amp;query=Che%2C+H">Hui Che</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+Z">Zishuo Li</a>, <a href="/search/eess?searchtype=author&amp;query=Yang%2C+W">Wenming Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.16384v1-abstract-short" style="display: inline;"> Ultrasound imaging is crucial for evaluating organ morphology and function, yet depth adjustment can degrade image quality and field-of-view, presenting a depth-dependent dilemma. Traditional interpolation-based zoom-in techniques often sacrifice detail and introduce artifacts. Motivated by the potential of arbitrary-scale super-resolution to naturally address these inherent challenges, we present&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.16384v1-abstract-full').style.display = 'inline'; document.getElementById('2403.16384v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.16384v1-abstract-full" style="display: none;"> Ultrasound imaging is crucial for evaluating organ morphology and function, yet depth adjustment can degrade image quality and field-of-view, presenting a depth-dependent dilemma. Traditional interpolation-based zoom-in techniques often sacrifice detail and introduce artifacts. Motivated by the potential of arbitrary-scale super-resolution to naturally address these inherent challenges, we present the Residual Dense Swin Transformer Network (RDSTN), designed to capture the non-local characteristics and long-range dependencies intrinsic to ultrasound images. 
It comprises a linear embedding module for feature enhancement, an encoder with shifted-window attention for modeling non-locality, and an MLP decoder for continuous detail reconstruction. This strategy streamlines balancing image quality and field-of-view, which offers superior textures over traditional methods. Experimentally, RDSTN outperforms existing approaches while requiring fewer parameters. In conclusion, RDSTN shows promising potential for ultrasound image enhancement by overcoming the limitations of conventional interpolation-based methods and achieving depth-independent imaging. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.16384v1-abstract-full').style.display = 'none'; document.getElementById('2403.16384v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ICASSP2024, https://ieeexplore.ieee.org/document/10447712</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP) </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.12382">arXiv:2403.12382</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2403.12382">pdf</a>, <a href="https://arxiv.org/format/2403.12382">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Low-Trace Adaptation of Zero-shot Self-supervised Blind Image Denoising </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Hu%2C+J">Jintong Hu</a>, <a href="/search/eess?searchtype=author&amp;query=Xia%2C+B">Bin Xia</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+B">Bingchen Li</a>, <a href="/search/eess?searchtype=author&amp;query=Yang%2C+W">Wenming Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.12382v1-abstract-short" style="display: inline;"> Deep learning-based denoiser has been the focus of recent development on image denoising. In the past few years, there has been increasing interest in developing self-supervised denoising networks that only require noisy images, without the need for clean ground truth for training. However, a performance gap remains between current self-supervised methods and their supervised counterparts. 
arXiv:2403.12382 (https://arxiv.org/abs/2403.12382) [pdf, other]
Subjects: eess.IV (Image and Video Processing); cs.CV (Computer Vision and Pattern Recognition); cs.LG (Machine Learning)
Title: Low-Trace Adaptation of Zero-shot Self-supervised Blind Image Denoising
Authors: Jintong Hu, Bin Xia, Bingchen Li, Wenming Yang
Abstract: Deep learning-based denoiser has been the focus of recent development on image denoising. In the past few years, there has been increasing interest in developing self-supervised denoising networks that only require noisy images, without the need for clean ground truth for training. However, a performance gap remains between current self-supervised methods and their supervised counterparts. Additionally, these methods commonly depend on assumptions about noise characteristics, thereby constraining their applicability in real-world scenarios. Inspired by the properties of the Frobenius norm expansion, we discover that incorporating a trace term reduces the optimization goal disparity between self-supervised and supervised methods, thereby enhancing the performance of self-supervised learning. To exploit this insight, we propose a trace-constraint loss function and design the low-trace adaptation Noise2Noise (LoTA-N2N) model that bridges the gap between self-supervised and supervised learning. Furthermore, we have discovered that several existing self-supervised denoising frameworks naturally fall within the proposed trace-constraint loss as subcases. Extensive experiments conducted on natural and confocal image datasets indicate that our method achieves state-of-the-art performance within the realm of zero-shot self-supervised image denoising approaches, without relying on any assumptions regarding the noise.
Submitted 18 March, 2024; originally announced March 2024.
Comments: 11 pages, 6 figures
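The abstract builds on a zero-shot Noise2Noise objective augmented with a trace-style term. A minimal sketch of that overall shape is below; the exact trace-constraint of LoTA-N2N is not stated in the abstract, so the trace_surrogate term and the checkerboard pair construction are placeholder assumptions, not the paper's formulation:

import torch
import torch.nn.functional as F

def checkerboard_pair(noisy: torch.Tensor):
    """Split one noisy image into two half-resolution noisy views (assumes even H, W);
    a common way to build a Noise2Noise pair from a single image."""
    return noisy[..., 0::2, 0::2], noisy[..., 1::2, 1::2]

def lota_n2n_style_loss(denoiser, noisy, lam: float = 0.1):
    a, b = checkerboard_pair(noisy)
    pred = denoiser(a)
    recon = F.mse_loss(pred, b)                              # self-supervised N2N term
    trace_surrogate = (pred * (pred - denoiser(b))).mean()   # placeholder for the paper's trace term
    return recon + lam * trace_surrogate

net = torch.nn.Identity()                                    # stand-in for a denoising network
loss = lota_n2n_style_loss(net, torch.randn(1, 1, 64, 64))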
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">11pages, 6 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.00628">arXiv:2403.00628</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2403.00628">pdf</a>, <a href="https://arxiv.org/format/2403.00628">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Region-Adaptive Transform with Segmentation Prior for Image Compression </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Liu%2C+Y">Yuxi Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Yang%2C+W">Wenhan Yang</a>, <a href="/search/eess?searchtype=author&amp;query=Bai%2C+H">Huihui Bai</a>, <a href="/search/eess?searchtype=author&amp;query=Wei%2C+Y">Yunchao Wei</a>, <a href="/search/eess?searchtype=author&amp;query=Zhao%2C+Y">Yao Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.00628v4-abstract-short" style="display: inline;"> Learned Image Compression (LIC) has shown remarkable progress in recent years. Existing works commonly employ CNN-based or self-attention-based modules as transform methods for compression. However, there is no prior research on neural transform that focuses on specific regions. In response, we introduce the class-agnostic segmentation masks (i.e. semantic masks without category labels) for extrac&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.00628v4-abstract-full').style.display = 'inline'; document.getElementById('2403.00628v4-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.00628v4-abstract-full" style="display: none;"> Learned Image Compression (LIC) has shown remarkable progress in recent years. Existing works commonly employ CNN-based or self-attention-based modules as transform methods for compression. However, there is no prior research on neural transform that focuses on specific regions. In response, we introduce the class-agnostic segmentation masks (i.e. semantic masks without category labels) for extracting region-adaptive contextual information. Our proposed module, Region-Adaptive Transform, applies adaptive convolutions on different regions guided by the masks. Additionally, we introduce a plug-and-play module named Scale Affine Layer to incorporate rich contexts from various regions. While there have been prior image compression efforts that involve segmentation masks as additional intermediate inputs, our approach differs significantly from them. Our advantages lie in that, to avoid extra bitrate overhead, we treat these masks as privilege information, which is accessible during the model training stage but not required during the inference phase. 
To the best of our knowledge, we are the first to employ class-agnostic masks as privilege information and achieve superior performance in pixel-fidelity metrics, such as Peak Signal to Noise Ratio (PSNR). The experimental results demonstrate our improvement compared to previously well-performing methods, with about 8.2% bitrate saving compared to VTM-17.0. The source code is available at https://github.com/GityuxiLiu/SegPIC-for-Image-Compression. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.00628v4-abstract-full').style.display = 'none'; document.getElementById('2403.00628v4-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 1 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to ECCV 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2402.18192">arXiv:2402.18192</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2402.18192">pdf</a>, <a href="https://arxiv.org/format/2402.18192">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Misalignment-Robust Frequency Distribution Loss for Image Transformation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Ni%2C+Z">Zhangkai Ni</a>, <a href="/search/eess?searchtype=author&amp;query=Wu%2C+J">Juncheng Wu</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+Z">Zian Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Yang%2C+W">Wenhan Yang</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+H">Hanli Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Ma%2C+L">Lin Ma</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2402.18192v1-abstract-short" style="display: inline;"> This paper aims to address a common challenge in deep learning-based image transformation methods, such as image enhancement and super-resolution, which heavily rely on precisely aligned paired datasets with pixel-level alignments. However, creating precisely aligned paired images presents significant challenges and hinders the advancement of methods trained on such data. 
arXiv:2402.18192 (https://arxiv.org/abs/2402.18192) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition); eess.IV (Image and Video Processing)
Title: Misalignment-Robust Frequency Distribution Loss for Image Transformation
Authors: Zhangkai Ni, Juncheng Wu, Zian Wang, Wenhan Yang, Hanli Wang, Lin Ma
Abstract: This paper aims to address a common challenge in deep learning-based image transformation methods, such as image enhancement and super-resolution, which heavily rely on precisely aligned paired datasets with pixel-level alignments. However, creating precisely aligned paired images presents significant challenges and hinders the advancement of methods trained on such data. To overcome this challenge, this paper introduces a novel and simple Frequency Distribution Loss (FDL) for computing distribution distance within the frequency domain. Specifically, we transform image features into the frequency domain using Discrete Fourier Transformation (DFT). Subsequently, frequency components (amplitude and phase) are processed separately to form the FDL loss function. Our method is empirically proven effective as a training constraint due to the thoughtful utilization of global information in the frequency domain. Extensive experimental evaluations, focusing on image enhancement and super-resolution tasks, demonstrate that FDL outperforms existing misalignment-robust loss functions. Furthermore, we explore the potential of our FDL for image style transfer that relies solely on completely misaligned data. Our code is available at: https://github.com/eezkni/FDL
Submitted 28 February, 2024; originally announced February 2024.
Comments: Accepted to Computer Vision and Pattern Recognition Conference (CVPR) 2024
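The core mechanism here (a DFT of features, with amplitude and phase handled separately) can be sketched very simply. The snippet below uses plain L1 distances on the spectra of raw images and ignores phase wrap-around; the released FDL operates on deep features and uses a distribution distance, so this is only an approximation of the idea:

import torch

def frequency_domain_loss(pred: torch.Tensor, target: torch.Tensor,
                          w_amp: float = 1.0, w_pha: float = 0.1) -> torch.Tensor:
    """Compare two images (or feature maps) in the frequency domain,
    treating amplitude and phase components separately."""
    fp = torch.fft.fft2(pred, norm="ortho")
    ft = torch.fft.fft2(target, norm="ortho")
    amp = (fp.abs() - ft.abs()).abs().mean()                  # amplitude-spectrum term
    pha = (torch.angle(fp) - torch.angle(ft)).abs().mean()    # phase-spectrum term (no wrap handling)
    return w_amp * amp + w_pha * pha

loss = frequency_domain_loss(torch.rand(1, 3, 64, 64), torch.rand(1, 3, 64, 64))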
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to Computer Vision and Pattern Recognition Conference (CVPR) 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2402.17277">arXiv:2402.17277</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2402.17277">pdf</a>, <a href="https://arxiv.org/format/2402.17277">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/JIOT.2024.3454223">10.1109/JIOT.2024.3454223 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> RISAR: RIS-assisted Human Activity Recognition with Commercial Wi-Fi Devices </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Liu%2C+J">Junshuo Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Huang%2C+Y">Yunlong Huang</a>, <a href="/search/eess?searchtype=author&amp;query=Yang%2C+W">Wei Yang</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+Z">Zhe Li</a>, <a href="/search/eess?searchtype=author&amp;query=Xiong%2C+R">Rujing Xiong</a>, <a href="/search/eess?searchtype=author&amp;query=Mi%2C+T">Tiebin Mi</a>, <a href="/search/eess?searchtype=author&amp;query=Shi%2C+X">Xin Shi</a>, <a href="/search/eess?searchtype=author&amp;query=Qiu%2C+R+C">Robert C. Qiu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2402.17277v3-abstract-short" style="display: inline;"> Human activity recognition (HAR) holds significant importance in smart homes, security, and healthcare. Existing systems face limitations because of the insufficient spatial diversity provided by a limited number of antennas. Furthermore, inefficiencies in noise reduction and feature extraction from sensing data pose challenges to recognition performance. This study presents a reconfigurable intel&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.17277v3-abstract-full').style.display = 'inline'; document.getElementById('2402.17277v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2402.17277v3-abstract-full" style="display: none;"> Human activity recognition (HAR) holds significant importance in smart homes, security, and healthcare. Existing systems face limitations because of the insufficient spatial diversity provided by a limited number of antennas. Furthermore, inefficiencies in noise reduction and feature extraction from sensing data pose challenges to recognition performance. This study presents a reconfigurable intelligent surface (RIS)-assisted passive human activity recognition (RISAR) method, compatible with commercial Wi-Fi devices. RISAR leverages a RIS to enhance the spatial diversity of Wi-Fi signals, effectively capturing a wider range of information distributed across the spatial domain. 
A novel high-dimensional factor model based on random matrix theory is proposed to address noise reduction and feature extraction in the temporal domain. A dual-stream spatial-temporal attention network model is developed to assign variable weights to different characteristics and sequences, mimicking human cognitive processes in prioritizing essential information. Experimental analysis shows that RISAR significantly outperforms existing HAR methods in accuracy and efficiency, achieving an average accuracy of 97.26%. These findings underscore RISAR&#39;s adaptability and potential as a robust activity recognition solution in real environments. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.17277v3-abstract-full').style.display = 'none'; document.getElementById('2402.17277v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 27 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2401.15105">arXiv:2401.15105</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2401.15105">pdf</a>, <a href="https://arxiv.org/format/2401.15105">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Diffusion Enhancement for Cloud Removal in Ultra-Resolution Remote Sensing Imagery </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Sui%2C+J">Jialu Sui</a>, <a href="/search/eess?searchtype=author&amp;query=Ma%2C+Y">Yiyang Ma</a>, <a href="/search/eess?searchtype=author&amp;query=Yang%2C+W">Wenhan Yang</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+X">Xiaokang Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Pun%2C+M">Man-On Pun</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+J">Jiaying Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2401.15105v1-abstract-short" style="display: inline;"> The presence of cloud layers severely compromises the quality and effectiveness of optical remote sensing (RS) images. However, existing deep-learning (DL)-based Cloud Removal (CR) techniques encounter difficulties in accurately reconstructing the original visual authenticity and detailed semantic content of the images. 
To tackle this challenge, this work proposes to encompass enhancements at the&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.15105v1-abstract-full').style.display = 'inline'; document.getElementById('2401.15105v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2401.15105v1-abstract-full" style="display: none;"> The presence of cloud layers severely compromises the quality and effectiveness of optical remote sensing (RS) images. However, existing deep-learning (DL)-based Cloud Removal (CR) techniques encounter difficulties in accurately reconstructing the original visual authenticity and detailed semantic content of the images. To tackle this challenge, this work proposes to encompass enhancements at the data and methodology fronts. On the data side, an ultra-resolution benchmark named CUHK Cloud Removal (CUHK-CR) of 0.5m spatial resolution is established. This benchmark incorporates rich detailed textures and diverse cloud coverage, serving as a robust foundation for designing and assessing CR models. From the methodology perspective, a novel diffusion-based framework for CR called Diffusion Enhancement (DE) is proposed to perform progressive texture detail recovery, which mitigates the training difficulty with improved inference accuracy. Additionally, a Weight Allocation (WA) network is developed to dynamically adjust the weights for feature fusion, thereby further improving performance, particularly in the context of ultra-resolution image generation. Furthermore, a coarse-to-fine training strategy is applied to effectively expedite training convergence while reducing the computational complexity required to handle ultra-resolution images. Extensive experiments on the newly established CUHK-CR and existing datasets such as RICE confirm that the proposed DE framework outperforms existing DL-based methods in terms of both perceptual quality and signal fidelity. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.15105v1-abstract-full').style.display = 'none'; document.getElementById('2401.15105v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 January, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2312.02773">arXiv:2312.02773</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2312.02773">pdf</a>, <a href="https://arxiv.org/format/2312.02773">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Integrating Plug-and-Play Data Priors with Weighted Prediction Error for Speech Dereverberation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Yang%2C+Z">Ziye Yang</a>, <a href="/search/eess?searchtype=author&amp;query=Yang%2C+W">Wenxing Yang</a>, <a href="/search/eess?searchtype=author&amp;query=Xie%2C+K">Kai Xie</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+J">Jie Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2312.02773v1-abstract-short" style="display: inline;"> Speech dereverberation aims to alleviate the detrimental effects of late-reverberant components. While the weighted prediction error (WPE) method has shown superior performance in dereverberation, there is still room for further improvement in terms of performance and robustness in complex and noisy environments. Recent research has highlighted the effectiveness of integrating physics-based and da&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.02773v1-abstract-full').style.display = 'inline'; document.getElementById('2312.02773v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2312.02773v1-abstract-full" style="display: none;"> Speech dereverberation aims to alleviate the detrimental effects of late-reverberant components. While the weighted prediction error (WPE) method has shown superior performance in dereverberation, there is still room for further improvement in terms of performance and robustness in complex and noisy environments. Recent research has highlighted the effectiveness of integrating physics-based and data-driven methods, enhancing the performance of various signal processing tasks while maintaining interpretability. Motivated by these advancements, this paper presents a novel dereverberation frame-work, which incorporates data-driven methods for capturing speech priors within the WPE framework. The plug-and-play strategy (PnP), specifically the regularization by denoising (RED) strategy, is utilized to incorporate speech prior information learnt from data during the optimization problem solving iterations. Experimental results validate the effectiveness of the proposed approach. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.02773v1-abstract-full').style.display = 'none'; document.getElementById('2312.02773v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2023. 
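The generic regularization-by-denoising (RED) update this abstract refers to alternates a data-fidelity gradient step with a pull toward a learned denoiser. A minimal sketch of that iteration is below; the data term is a toy least-squares stand-in, not the WPE objective, and the denoiser is a placeholder:

import torch

def red_iterations(x0, grad_fidelity, denoiser, steps: int = 50,
                   mu: float = 0.5, lam: float = 0.1):
    """Generic RED descent: x <- x - mu * (grad_fidelity(x) + lam * (x - D(x)))."""
    x = x0.clone()
    for _ in range(steps):
        x = x - mu * (grad_fidelity(x) + lam * (x - denoiser(x)))
    return x

y = torch.randn(1, 16000)               # observed (reverberant/noisy) signal, toy data
grad = lambda x: x - y                  # gradient of 0.5 * ||x - y||^2 (stand-in data term)
prior = lambda x: x                     # placeholder for a learned speech denoiser
x_hat = red_iterations(y, grad, prior)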
arXiv:2310.20446 (https://arxiv.org/abs/2310.20446) [pdf, other]
Subjects: cs.SD (Sound); cs.CV (Computer Vision and Pattern Recognition); cs.MM (Multimedia); eess.AS (Audio and Speech Processing)
Title: LAVSS: Location-Guided Audio-Visual Spatial Audio Separation
Authors: Yuxin Ye, Wenming Yang, Yapeng Tian
Abstract: Existing machine learning research has achieved promising results in monaural audio-visual separation (MAVS). However, most MAVS methods purely consider what the sound source is, not where it is located. This can be a problem in VR/AR scenarios, where listeners need to be able to distinguish between similar audio sources located in different directions. To address this limitation, we have generalized MAVS to spatial audio separation and proposed LAVSS: a location-guided audio-visual spatial audio separator. LAVSS is inspired by the correlation between spatial audio and visual location. We introduce the phase difference carried by binaural audio as spatial cues, and we utilize positional representations of sounding objects as additional modality guidance. We also leverage multi-level cross-modal attention to perform visual-positional collaboration with audio features. In addition, we adopt a pre-trained monaural separator to transfer knowledge from rich mono sounds to boost spatial audio separation. This exploits the correlation between monaural and binaural channels. Experiments on the FAIR-Play dataset demonstrate the superiority of the proposed LAVSS over existing benchmarks of audio-visual separation. Our project page: https://yyx666660.github.io/LAVSS/.
Submitted 31 October, 2023; originally announced October 2023.
Comments: Accepted by WACV2024

arXiv:2309.12234 (https://arxiv.org/abs/2309.12234) [pdf, ps, other]
Subjects: cs.CL (Computation and Language); eess.AS (Audio and Speech Processing)
Title: Bridging the Gaps of Both Modality and Language: Synchronous Bilingual CTC for Speech Translation and Speech Recognition
Authors: Chen Xu, Xiaoqian Liu, Erfeng He, Yuhao Zhang, Qianqian Dong, Tong Xiao, Jingbo Zhu, Dapeng Man, Wu Yang
Abstract: In this study, we present synchronous bilingual Connectionist Temporal Classification (CTC), an innovative framework that leverages dual CTC to bridge the gaps of both modality and language in the speech translation (ST) task. Utilizing transcript and translation as concurrent objectives for CTC, our model bridges the gap between audio and text as well as between source and target languages. Building upon the recent advances in CTC application, we develop an enhanced variant, BiL-CTC+, that establishes new state-of-the-art performances on the MuST-C ST benchmarks under resource-constrained scenarios. Intriguingly, our method also yields significant improvements in speech recognition performance, revealing the effect of cross-lingual learning on transcription and demonstrating its broad applicability. The source code is available at https://github.com/xuchennlp/S2T.
Submitted 21 September, 2023; originally announced September 2023.
Comments: Submitted to ICASSP 2024
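The "transcript and translation as concurrent objectives" idea amounts to attaching two CTC heads to one speech encoder and summing their losses. A minimal sketch follows (dimensions, heads, and the equal weighting are illustrative assumptions, not the BiL-CTC+ recipe):

import torch
import torch.nn as nn

ctc = nn.CTCLoss(blank=0, zero_infinity=True)

def bilingual_ctc_loss(enc_out, src_head, tgt_head,
                       src_tokens, tgt_tokens,
                       input_lengths, src_lengths, tgt_lengths):
    # enc_out: (T, B, D) frame-level states from a shared speech encoder
    src_logp = src_head(enc_out).log_softmax(-1)   # (T, B, V_src) transcript head
    tgt_logp = tgt_head(enc_out).log_softmax(-1)   # (T, B, V_tgt) translation head
    loss_src = ctc(src_logp, src_tokens, input_lengths, src_lengths)
    loss_tgt = ctc(tgt_logp, tgt_tokens, input_lengths, tgt_lengths)
    return loss_src + loss_tgt                     # equal weighting, for illustration

T, B, D = 50, 2, 256
enc = torch.randn(T, B, D)
src_head, tgt_head = nn.Linear(D, 100), nn.Linear(D, 120)
src, tgt = torch.randint(1, 100, (B, 12)), torch.randint(1, 120, (B, 15))
loss = bilingual_ctc_loss(enc, src_head, tgt_head, src, tgt,
                          torch.full((B,), T), torch.full((B,), 12), torch.full((B,), 15))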
arXiv:2309.11276 (https://arxiv.org/abs/2309.11276) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition); eess.IV (Image and Video Processing)
DOI: 10.1145/3581783.3611955
Title: Towards Real-Time Neural Video Codec for Cross-Platform Application Using Calibration Information
Authors: Kuan Tian, Yonghang Guan, Jinxi Xiang, Jun Zhang, Xiao Han, Wei Yang
Abstract: The state-of-the-art neural video codecs have outperformed the most sophisticated traditional codecs in terms of RD performance in certain cases. However, utilizing them for practical applications is still challenging for two major reasons. 1) Cross-platform computational errors resulting from floating point operations can lead to inaccurate decoding of the bitstream. 2) The high computational complexity of the encoding and decoding process poses a challenge in achieving real-time performance. In this paper, we propose a real-time cross-platform neural video codec, which is capable of efficiently decoding of 720P video bitstream from other encoding platforms on a consumer-grade GPU. First, to solve the problem of inconsistency of codec caused by the uncertainty of floating point calculations across platforms, we design a calibration transmitting system to guarantee the consistent quantization of entropy parameters between the encoding and decoding stages. The parameters that may have transboundary quantization between encoding and decoding are identified in the encoding stage, and their coordinates will be delivered by auxiliary transmitted bitstream. By doing so, these inconsistent parameters can be processed properly in the decoding stage. Furthermore, to reduce the bitrate of the auxiliary bitstream, we rectify the distribution of entropy parameters using a piecewise Gaussian constraint. Second, to match the computational limitations on the decoding side for real-time video codec, we design a lightweight model. A series of efficiency techniques enable our model to achieve 25 FPS decoding speed on NVIDIA RTX 2080 GPU. Experimental results demonstrate that our model can achieve real-time decoding of 720P videos while encoding on another platform. Furthermore, the real-time model brings up to a maximum of 24.2% BD-rate improvement from the perspective of PSNR with the anchor H.265.
Submitted 20 September, 2023; originally announced September 2023.
Comments: 14 pages

arXiv:2309.11139 (https://arxiv.org/abs/2309.11139) [pdf, other]
Subjects: eess.IV (Image and Video Processing); cs.CV (Computer Vision and Pattern Recognition)
Title: More complex encoder is not all you need
Authors: Weibin Yang, Longwei Xu, Pengwei Wang, Dehua Geng, Yusong Li, Mingyuan Xu, Zhiqi Dong
Abstract: U-Net and its variants have been widely used in medical image segmentation. However, most current U-Net variants confine their improvement strategies to building more complex encoder, while leaving the decoder unchanged or adopting a simple symmetric structure. These approaches overlook the true functionality of the decoder: receiving low-resolution feature maps from the encoder and restoring feature map resolution and lost information through upsampling. As a result, the decoder, especially its upsampling component, plays a crucial role in enhancing segmentation outcomes. However, in 3D medical image segmentation, the commonly used transposed convolution can result in visual artifacts. This issue stems from the absence of direct relationship between adjacent pixels in the output feature map. Furthermore, plain encoder has already possessed sufficient feature extraction capability because downsampling operation leads to the gradual expansion of the receptive field, but the loss of information during downsampling process is unignorable. To address the gap in relevant research, we extend our focus beyond the encoder and introduce neU-Net (i.e., not complex encoder U-Net), which incorporates a novel Sub-pixel Convolution for upsampling to construct a powerful decoder. Additionally, we introduce multi-scale wavelet inputs module on the encoder side to provide additional information. Our model design achieves excellent results, surpassing other state-of-the-art methods on both the Synapse and ACDC datasets.
Submitted 27 October, 2023; v1 submitted 20 September, 2023; originally announced September 2023.
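Sub-pixel convolution replaces transposed convolution by predicting extra channels and rearranging them into a finer grid, which avoids the artifacts mentioned above. A 2D sketch is shown for brevity (the paper's setting is 3D, where an analogous channel-to-voxel rearrangement is needed; layer sizes are illustrative):

import torch
import torch.nn as nn

class SubPixelUp2d(nn.Module):
    """Upsample by scale r: a 3x3 convolution produces r*r times as many channels,
    then PixelShuffle rearranges them into an r-times larger spatial grid."""
    def __init__(self, in_ch: int, out_ch: int, scale: int = 2):
        super().__init__()
        self.conv = nn.Conv2d(in_ch, out_ch * scale * scale, kernel_size=3, padding=1)
        self.shuffle = nn.PixelShuffle(scale)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.shuffle(self.conv(x))

x = torch.randn(1, 64, 16, 16)
print(SubPixelUp2d(64, 32)(x).shape)   # torch.Size([1, 32, 32, 32])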
arXiv:2309.06619 (https://arxiv.org/abs/2309.06619) [pdf, other]
Subjects: cs.LG (Machine Learning); cs.CL (Computation and Language); eess.SY (Systems and Control)
Title: RT-LM: Uncertainty-Aware Resource Management for Real-Time Inference of Language Models
Authors: Yufei Li, Zexin Li, Wei Yang, Cong Liu
Abstract: Recent advancements in language models (LMs) have gained substantial attentions on their capability to generate human-like responses. Though exhibiting a promising future for various applications such as conversation AI, these LMs face deployment challenges on various devices due to their extreme computational cost and unpredictable inference latency. Such varied inference latency, identified as a consequence of uncertainty intrinsic to the nature of language, can lead to computational inefficiency and degrade the overall performance of LMs, especially under high-traffic workloads. Unfortunately, the bandwidth of these uncertainty sources is extensive, complicating the prediction of latency and the effects emanating from such uncertainties. To understand and mitigate the impact of uncertainty on real-time response-demanding systems, we take the first step to comprehend, quantify and optimize these uncertainty-induced latency performance variations in LMs. Specifically, we present RT-LM, an uncertainty-aware resource management ecosystem for real-time inference of LMs. RT-LM innovatively quantifies how specific input uncertainties, adversely affect latency, often leading to an increased output length. Exploiting these insights, we devise a lightweight yet effective method to dynamically correlate input text uncertainties with output length at runtime. Utilizing this quantification as a latency heuristic, we integrate the uncertainty information into a system-level scheduler which explores several uncertainty-induced optimization opportunities, including uncertainty-aware prioritization, dynamic consolidation, and strategic CPU offloading. Quantitative experiments across five state-of-the-art LMs on two hardware platforms demonstrates that RT-LM can significantly reduce the average response time and improve throughput while incurring a rather small runtime overhead.
Submitted 12 September, 2023; originally announced September 2023.
Comments: Accepted by RTSS 2023

arXiv:2309.00187 (https://arxiv.org/abs/2309.00187) [pdf]
Subjects: eess.SY (Systems and Control); cs.CV (Computer Vision and Pattern Recognition)
Title: Vision-aided nonlinear control framework for shake table tests
Authors: Zhongwei Chen, T. Y. Yang, Yifei Xiao, Xiao Pan, Wanyan Yang
Abstract: The structural response under the earthquake excitations can be simulated by scaled-down model shake table tests or full-scale model shake table tests. In this paper, adaptive control theory is used as a nonlinear shake table control algorithm which considers the inherent nonlinearity of the shake table system and the Control-Structural Interaction (CSI) effect that the linear controller cannot consider, such as the Proportional-Integral-Derivative (PID) controller. The mass of the specimen can be assumed as an unknown variation and the unknown parameter will be replaced by an estimated value in the proposed control framework. The signal generated by the control law of the adaptive control method will be implemented by a loop-shaping controller. To verify the stability and feasibility of the proposed control framework, a simulation of a bare shake table and experiments with a bare shake table with a two-story frame were carried out. This study randomly selects Earthquake recordings from the Pacific Earthquake Engineering Research Center (PEER) database. The simulation and experimental results show that the proposed control framework can be effectively used in shake table control.
Submitted 31 August, 2023; originally announced September 2023.
Comments: 10 pages, 7 figures, accepted in the Canadian Conference - Pacific Conference on Earthquake Engineering 2023, Vancouver, British Columbia

arXiv:2308.07733 (https://arxiv.org/abs/2308.07733) [pdf, other]
Subjects: eess.IV (Image and Video Processing); cs.CV (Computer Vision and Pattern Recognition); cs.MM (Multimedia)
DOI: 10.1145/3581783.3612187
Title: Dynamic Low-Rank Instance Adaptation for Universal Neural Image Compression
Authors: Yue Lv, Jinxi Xiang, Jun Zhang, Wenming Yang, Xiao Han, Wei Yang
Abstract: The latest advancements in neural image compression show great potential in surpassing the rate-distortion performance of conventional standard codecs. Nevertheless, there exists an indelible domain gap between the datasets utilized for training (i.e., natural images) and those utilized for inference (e.g., artistic images). Our proposal involves a low-rank adaptation approach aimed at addressing the rate-distortion drop observed in out-of-domain datasets. Specifically, we perform low-rank matrix decomposition to update certain adaptation parameters of the client's decoder. These updated parameters, along with image latents, are encoded into a bitstream and transmitted to the decoder in practical scenarios. Due to the low-rank constraint imposed on the adaptation parameters, the resulting bit rate overhead is small. Furthermore, the bit rate allocation of low-rank adaptation is non-trivial, considering the diverse inputs require varying adaptation bitstreams. We thus introduce a dynamic gating network on top of the low-rank adaptation method, in order to decide which decoder layer should employ adaptation. The dynamic adaptation network is optimized end-to-end using rate-distortion loss. Our proposed method exhibits universality across diverse image datasets. Extensive results demonstrate that this paradigm significantly mitigates the domain gap, surpassing non-adaptive methods with an average BD-rate improvement of approximately 19% across out-of-domain images. Furthermore, it outperforms the most advanced instance adaptive methods by roughly 5% BD-rate. Ablation studies confirm our method's ability to universally enhance various image compression architectures.
Submitted 15 August, 2023; originally announced August 2023.
Comments: Accepted by ACM MM 2023, 13 pages, 12 figures
ACM Class: I.4.2; E.4
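The low-rank decoder update described in this abstract follows the familiar LoRA pattern: keep the pretrained layer frozen and add a trainable low-rank residual whose few parameters can be signalled per image. A minimal sketch (a linear layer stands in for a decoder layer; the gating network and the entropy coding of the adapter weights are omitted):

import torch
import torch.nn as nn

class LowRankAdapter(nn.Module):
    """Wrap a frozen layer with a low-rank residual: y = W x + B (A x)."""
    def __init__(self, base: nn.Linear, rank: int = 4):
        super().__init__()
        self.base = base
        for p in self.base.parameters():
            p.requires_grad_(False)          # pretrained codec weights stay fixed
        self.A = nn.Parameter(torch.zeros(rank, base.in_features))
        self.B = nn.Parameter(torch.zeros(base.out_features, rank))
        nn.init.normal_(self.A, std=0.02)    # B starts at zero, so adaptation starts as a no-op

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.base(x) + x @ self.A.t() @ self.B.t()

layer = LowRankAdapter(nn.Linear(192, 192), rank=4)
y = layer(torch.randn(8, 192))               # only A and B would be overfit per image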
Kot</a>, <a href="/search/eess?searchtype=author&amp;query=Wen%2C+B">Bihan Wen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2307.07710v2-abstract-short" style="display: inline;"> Previous raw image-based low-light image enhancement methods predominantly relied on feed-forward neural networks to learn deterministic mappings from low-light to normally-exposed images. However, they failed to capture critical distribution information, leading to visually undesirable results. This work addresses the issue by seamlessly integrating a diffusion model with a physics-based exposure&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2307.07710v2-abstract-full').style.display = 'inline'; document.getElementById('2307.07710v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2307.07710v2-abstract-full" style="display: none;"> Previous raw image-based low-light image enhancement methods predominantly relied on feed-forward neural networks to learn deterministic mappings from low-light to normally-exposed images. However, they failed to capture critical distribution information, leading to visually undesirable results. This work addresses the issue by seamlessly integrating a diffusion model with a physics-based exposure model. Different from a vanilla diffusion model that has to perform Gaussian denoising, with the injected physics-based exposure model, our restoration process can directly start from a noisy image instead of pure noise. As such, our method obtains significantly improved performance and reduced inference time compared with vanilla diffusion models. To make full use of the advantages of different intermediate steps, we further propose an adaptive residual layer that effectively screens out the side-effect in the iterative refinement when the intermediate results have been already well-exposed. The proposed framework can work with both real-paired datasets, SOTA noise models, and different backbone networks. Note that, the proposed framework is compatible with real-paired datasets, real/synthetic noise models, and different backbone networks. We evaluate the proposed method on various public benchmarks, achieving promising results with consistent improvements using different exposure models and backbones. Besides, the proposed method achieves better generalization capacity for unseen amplifying ratios and better performance than a larger feedforward neural model when few parameters are adopted. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2307.07710v2-abstract-full').style.display = 'none'; document.getElementById('2307.07710v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 August, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 15 July, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2023. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">accepted by ICCV2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2307.04122">arXiv:2307.04122</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2307.04122">pdf</a>, <a href="https://arxiv.org/format/2307.04122">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Enhancing Low-Light Images Using Infrared-Encoded Images </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Tian%2C+S">Shulin Tian</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+Y">Yufei Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Wan%2C+R">Renjie Wan</a>, <a href="/search/eess?searchtype=author&amp;query=Yang%2C+W">Wenhan Yang</a>, <a href="/search/eess?searchtype=author&amp;query=Kot%2C+A+C">Alex C. Kot</a>, <a href="/search/eess?searchtype=author&amp;query=Wen%2C+B">Bihan Wen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2307.04122v1-abstract-short" style="display: inline;"> Low-light image enhancement task is essential yet challenging as it is ill-posed intrinsically. Previous arts mainly focus on the low-light images captured in the visible spectrum using pixel-wise loss, which limits the capacity of recovering the brightness, contrast, and texture details due to the small number of income photons. In this work, we propose a novel approach to increase the visibility&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2307.04122v1-abstract-full').style.display = 'inline'; document.getElementById('2307.04122v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2307.04122v1-abstract-full" style="display: none;"> Low-light image enhancement task is essential yet challenging as it is ill-posed intrinsically. Previous arts mainly focus on the low-light images captured in the visible spectrum using pixel-wise loss, which limits the capacity of recovering the brightness, contrast, and texture details due to the small number of income photons. In this work, we propose a novel approach to increase the visibility of images captured under low-light environments by removing the in-camera infrared (IR) cut-off filter, which allows for the capture of more photons and results in improved signal-to-noise ratio due to the inclusion of information from the IR spectrum. To verify the proposed strategy, we collect a paired dataset of low-light images captured without the IR cut-off filter, with corresponding long-exposure reference images with an external filter. The experimental results on the proposed dataset demonstrate the effectiveness of the proposed method, showing better performance quantitatively and qualitatively. 

arXiv:2307.02334 (https://arxiv.org/abs/2307.02334) [pdf, ps, other] eess.IV cs.CV
Dual Arbitrary Scale Super-Resolution for Multi-Contrast MRI
Authors: Jiamiao Zhang, Yichen Chi, Jun Lyu, Wenming Yang, Yapeng Tian
Abstract: Limited by imaging systems, the reconstruction of Magnetic Resonance Imaging (MRI) images from partial measurement is essential to medical imaging research. Benefiting from the diverse and complementary information of multi-contrast MR images in different imaging modalities, multi-contrast Super-Resolution (SR) reconstruction is promising to yield SR images with higher quality. In the medical scenario, to fully visualize the lesion, radiologists are accustomed to zooming MR images at arbitrary scales rather than using a fixed scale, as used by most MRI SR methods. In addition, existing multi-contrast MRI SR methods often require a fixed resolution for the reference image, which makes acquiring reference images difficult and imposes limitations on arbitrary-scale SR tasks. To address these issues, we propose an implicit neural representation-based dual-arbitrary multi-contrast MRI super-resolution method, called Dual-ArbNet. First, we decouple the resolution of the target and reference images by a feature encoder, enabling the network to input target and reference images at arbitrary scales. Then, an implicit fusion decoder fuses the multi-contrast features and uses an Implicit Decoding Function (IDF) to obtain the final MRI SR results. Furthermore, we introduce a curriculum learning strategy to train our network, which improves the generalization and performance of our Dual-ArbNet. Extensive experiments on two public MRI datasets demonstrate that our method outperforms state-of-the-art approaches under different scale factors and has great potential in clinical practice.
Submitted 10 July, 2023; v1 submitted 5 July, 2023; originally announced July 2023.
Comments: Accepted by MICCAI 2023.
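
An implicit decoding function of the general kind named in the abstract is a coordinate-based MLP: it predicts intensities at arbitrary continuous pixel positions from fused features, which is what makes arbitrary output scales possible. The sketch below is illustrative only; the class name `ImplicitDecoder`, the layer sizes, and the way features and scale are fed in are assumptions rather than Dual-ArbNet's design.

```python
# Minimal coordinate-MLP sketch of an implicit decoding function for arbitrary-scale output.
import torch
import torch.nn as nn
import torch.nn.functional as F

class ImplicitDecoder(nn.Module):
    def __init__(self, feat_dim=64, hidden=128):
        super().__init__()
        self.mlp = nn.Sequential(
            nn.Linear(feat_dim + 2 + 1, hidden), nn.ReLU(),  # features + (x, y) coordinate + scale
            nn.Linear(hidden, hidden), nn.ReLU(),
            nn.Linear(hidden, 1),
        )

    def forward(self, feat_map, coords, scale):
        # feat_map: (B, C, H, W); coords: (B, N, 2) in [-1, 1]; scale: (B, 1)
        feats = F.grid_sample(feat_map, coords.unsqueeze(1), align_corners=False)  # (B, C, 1, N)
        feats = feats.squeeze(2).permute(0, 2, 1)                                  # (B, N, C)
        scale = scale.unsqueeze(1).expand(-1, coords.shape[1], -1)                  # (B, N, 1)
        return self.mlp(torch.cat([feats, coords, scale], dim=-1))                  # (B, N, 1)

dec = ImplicitDecoder()
out = dec(torch.randn(2, 64, 16, 16), torch.rand(2, 1024, 2) * 2 - 1, torch.full((2, 1), 2.0))
print(out.shape)  # torch.Size([2, 1024, 1])
```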

arXiv:2306.16324 (https://arxiv.org/abs/2306.16324) [pdf, other] eess.IV cs.CV
DoseDiff: Distance-aware Diffusion Model for Dose Prediction in Radiotherapy
Authors: Yiwen Zhang, Chuanpu Li, Liming Zhong, Zeli Chen, Wei Yang, Xuetao Wang
Abstract: Treatment planning, which is a critical component of the radiotherapy workflow, is typically carried out by a medical physicist in a time-consuming trial-and-error manner. Previous studies have proposed knowledge-based or deep-learning-based methods for predicting dose distribution maps to assist medical physicists in improving the efficiency of treatment planning. However, these dose prediction methods usually fail to effectively utilize distance information between surrounding tissues and targets or organs-at-risk (OARs). Moreover, they are poor at maintaining the distribution characteristics of ray paths in the predicted dose distribution maps, resulting in a loss of valuable information. In this paper, we propose a distance-aware diffusion model (DoseDiff) for precise prediction of dose distribution. We define dose prediction as a sequence of denoising steps, wherein the predicted dose distribution map is generated with the conditions of the computed tomography (CT) image and signed distance maps (SDMs). The SDMs are obtained by distance transformation from the masks of targets or OARs, which provide the distance from each pixel in the image to the outline of the targets or OARs. We further propose a multi-encoder and multi-scale fusion network (MMFNet) that incorporates multi-scale and transformer-based fusion modules to enhance information fusion between the CT image and SDMs at the feature level. We evaluate our model on two in-house datasets and a public dataset. The results demonstrate that our DoseDiff method outperforms state-of-the-art dose prediction methods in terms of both quantitative performance and visual quality.
Submitted 28 March, 2024; v1 submitted 28 June, 2023; originally announced June 2023.
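
A signed distance map of the kind the abstract describes can be computed from a binary target/OAR mask with a Euclidean distance transform. The sketch below shows one common construction; the sign convention (positive outside, negative inside) and the helper name `signed_distance_map` are assumptions for illustration.

```python
# Sketch: derive a signed distance map (SDM) from a binary organ/target mask.
import numpy as np
from scipy.ndimage import distance_transform_edt

def signed_distance_map(mask):
    """mask: 2-D boolean array, True inside the structure."""
    mask = mask.astype(bool)
    outside = distance_transform_edt(~mask)  # distance to the structure, measured outside it
    inside = distance_transform_edt(mask)    # distance to the background, measured inside it
    return outside - inside                  # positive outside, negative inside, near zero on the outline

mask = np.zeros((64, 64), dtype=bool)
mask[20:40, 24:44] = True
sdm = signed_distance_map(mask)
print(sdm.min(), sdm.max())
```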

arXiv:2306.12675 (https://arxiv.org/abs/2306.12675) [pdf, other] eess.SP
STAR-RIS-Assisted Privacy Protection in Semantic Communication System
Authors: Yiru Wang, Wanting Yang, Pengxin Guan, Yuping Zhao, Zehui Xiong
Abstract: Semantic communication (SemCom) has emerged as a promising architecture in the realm of intelligent communication paradigms. SemCom involves extracting and compressing the core information at the transmitter while enabling the receiver to interpret it based on established knowledge bases (KBs). This approach greatly enhances communication efficiency. However, the open nature of wireless transmission and the presence of homogeneous KBs among subscribers of the same data type pose a risk of privacy leakage in SemCom. To address this challenge, we propose to leverage the simultaneous transmitting and reflecting reconfigurable intelligent surface (STAR-RIS) to achieve privacy protection in a SemCom system. In this system, the STAR-RIS is utilized to enhance the signal transmission of the SemCom between a base station and a destination user, as well as to convert the signal into interference specifically for the eavesdropper (Eve). Simulation results demonstrate that our generated task-level disturbance outperforms other benchmarks in protecting SemCom privacy, as evidenced by the significantly lower task success rate achieved by Eve.
Submitted 22 June, 2023; originally announced June 2023.
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.12675v1-abstract-full').style.display = 'none'; document.getElementById('2306.12675v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2306.12058">arXiv:2306.12058</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2306.12058">pdf</a>, <a href="https://arxiv.org/format/2306.12058">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Beyond Learned Metadata-based Raw Image Reconstruction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Wang%2C+Y">Yufei Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Yu%2C+Y">Yi Yu</a>, <a href="/search/eess?searchtype=author&amp;query=Yang%2C+W">Wenhan Yang</a>, <a href="/search/eess?searchtype=author&amp;query=Guo%2C+L">Lanqing Guo</a>, <a href="/search/eess?searchtype=author&amp;query=Chau%2C+L">Lap-Pui Chau</a>, <a href="/search/eess?searchtype=author&amp;query=Kot%2C+A+C">Alex C. Kot</a>, <a href="/search/eess?searchtype=author&amp;query=Wen%2C+B">Bihan Wen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2306.12058v1-abstract-short" style="display: inline;"> While raw images have distinct advantages over sRGB images, e.g., linearity and fine-grained quantization levels, they are not widely adopted by general users due to their substantial storage requirements. Very recent studies propose to compress raw images by designing sampling masks within the pixel space of the raw image. However, these approaches often leave space for pursuing more effective im&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.12058v1-abstract-full').style.display = 'inline'; document.getElementById('2306.12058v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2306.12058v1-abstract-full" style="display: none;"> While raw images have distinct advantages over sRGB images, e.g., linearity and fine-grained quantization levels, they are not widely adopted by general users due to their substantial storage requirements. Very recent studies propose to compress raw images by designing sampling masks within the pixel space of the raw image. However, these approaches often leave space for pursuing more effective image representations and compact metadata. In this work, we propose a novel framework that learns a compact representation in the latent space, serving as metadata, in an end-to-end manner. Compared with lossy image compression, we analyze the intrinsic difference of the raw image reconstruction task caused by rich information from the sRGB image. 

arXiv:2306.08918 (https://arxiv.org/abs/2306.08918) [pdf, other] eess.IV cs.CV
PUGAN: Physical Model-Guided Underwater Image Enhancement Using GAN with Dual-Discriminators
Authors: Runmin Cong, Wenyu Yang, Wei Zhang, Chongyi Li, Chun-Le Guo, Qingming Huang, Sam Kwong
Abstract: Due to the light absorption and scattering induced by the water medium, underwater images usually suffer from degradation problems such as low contrast, color distortion, and blurred details, which aggravate the difficulty of downstream underwater understanding tasks. Obtaining clear and visually pleasing images has therefore become a common concern, giving rise to the task of underwater image enhancement (UIE). Among existing UIE methods, Generative Adversarial Network (GAN)-based methods perform well in visual aesthetics, while physical model-based methods have better scene adaptability. Inheriting the advantages of both types of models, we propose a physical model-guided GAN for UIE in this paper, referred to as PUGAN. The entire network is under the GAN architecture. On the one hand, we design a Parameters Estimation subnetwork (Par-subnet) to learn the parameters for physical model inversion, and use the generated color-enhanced image as auxiliary information for the Two-Stream Interaction Enhancement subnetwork (TSIE-subnet). Meanwhile, we design a Degradation Quantization (DQ) module in the TSIE-subnet to quantize scene degradation, thereby reinforcing the enhancement of key regions. On the other hand, we design Dual-Discriminators for the style-content adversarial constraint, promoting the authenticity and visual aesthetics of the results. Extensive experiments on three benchmark datasets demonstrate that our PUGAN outperforms state-of-the-art methods in both qualitative and quantitative metrics.
Submitted 15 June, 2023; originally announced June 2023.
Comments: 8 pages, 4 figures, Accepted by IEEE Transactions on Image Processing 2023.
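
For context, a simplified image formation model commonly used for underwater and hazy scenes is I = J * t + A * (1 - t), where J is the clean scene, t the per-pixel transmission, and A the ambient (veiling) light; "physical model inversion" in this setting amounts to solving for J. Whether PUGAN uses exactly this formulation is not stated in the abstract, so the sketch below is a generic illustration with made-up constants and helper names.

```python
# Sketch of the simplified physical degradation model I = J * t + A * (1 - t) and its inversion.
import numpy as np

def degrade(J, t, A):
    """J: clean image (H, W, 3); t: transmission in (0, 1]; A: ambient light per channel."""
    return J * t + A * (1.0 - t)

def invert(I, t, A, eps=1e-3):
    """Recover an estimate of the clean image given transmission and ambient light."""
    return (I - A * (1.0 - t)) / np.maximum(t, eps)

J = np.random.rand(32, 32, 3)
t = np.full((32, 32, 1), 0.6)
A = np.array([0.1, 0.4, 0.5])   # water attenuates red light most strongly
I = degrade(J, t, A)
print(np.allclose(invert(I, t, A), J))  # True
```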
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages, 4 figures, Accepted by IEEE Transactions on Image Processing 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2306.00794">arXiv:2306.00794</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2306.00794">pdf</a>, <a href="https://arxiv.org/format/2306.00794">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> SlothSpeech: Denial-of-service Attack Against Speech Recognition Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Haque%2C+M">Mirazul Haque</a>, <a href="/search/eess?searchtype=author&amp;query=Shah%2C+R">Rutvij Shah</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+S">Simin Chen</a>, <a href="/search/eess?searchtype=author&amp;query=%C5%9Ei%C5%9Fman%2C+B">Berrak 艦i艧man</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+C">Cong Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Yang%2C+W">Wei Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2306.00794v1-abstract-short" style="display: inline;"> Deep Learning (DL) models have been popular nowadays to execute different speech-related tasks, including automatic speech recognition (ASR). As ASR is being used in different real-time scenarios, it is important that the ASR model remains efficient against minor perturbations to the input. Hence, evaluating efficiency robustness of the ASR model is the need of the hour. We show that popular ASR m&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.00794v1-abstract-full').style.display = 'inline'; document.getElementById('2306.00794v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2306.00794v1-abstract-full" style="display: none;"> Deep Learning (DL) models have been popular nowadays to execute different speech-related tasks, including automatic speech recognition (ASR). As ASR is being used in different real-time scenarios, it is important that the ASR model remains efficient against minor perturbations to the input. Hence, evaluating efficiency robustness of the ASR model is the need of the hour. We show that popular ASR models like Speech2Text model and Whisper model have dynamic computation based on different inputs, causing dynamic efficiency. In this work, we propose SlothSpeech, a denial-of-service attack against ASR models, which exploits the dynamic behaviour of the model. SlothSpeech uses the probability distribution of the output text tokens to generate perturbations to the audio such that efficiency of the ASR model is decreased. 

arXiv:2305.18107 (https://arxiv.org/abs/2305.18107) [pdf, other] cs.CV eess.IV
Crafting Training Degradation Distribution for the Accuracy-Generalization Trade-off in Real-World Super-Resolution
Authors: Ruofan Zhang, Jinjin Gu, Haoyu Chen, Chao Dong, Yulun Zhang, Wenming Yang
Abstract: Super-resolution (SR) techniques designed for real-world applications commonly encounter two primary challenges: generalization performance and restoration accuracy. We demonstrate that when methods are trained using complex, large-range degradations to enhance generalization, a decline in accuracy is inevitable. However, since the degradation in a given real-world application typically exhibits a limited variation range, it becomes feasible to strike a trade-off between generalization performance and testing accuracy within this scope. In this work, we introduce a novel approach to craft training degradation distributions using a small set of reference images. Our strategy is founded upon the binned representation of the degradation space and the Fréchet distance between degradation distributions. Our results indicate that the proposed technique significantly improves performance on test images while preserving generalization capabilities in real-world applications.
Submitted 1 June, 2023; v1 submitted 29 May, 2023; originally announced May 2023.
Comments: Accepted to ICML 2023.
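
The Fréchet distance between two Gaussian-fit distributions (the same quantity used in FID) has a closed form, sketched below. How the paper bins the degradation space and which degradation statistics it compares are not shown here; random data stands in for degradation descriptors, and `frechet_distance` is an illustrative helper.

```python
# Sketch: 2-Wasserstein (Frechet) distance between two Gaussian fits.
import numpy as np
from scipy.linalg import sqrtm

def frechet_distance(x, y):
    """x, y: (n_samples, dim) arrays of degradation descriptors."""
    mu1, mu2 = x.mean(axis=0), y.mean(axis=0)
    c1, c2 = np.cov(x, rowvar=False), np.cov(y, rowvar=False)
    covmean = sqrtm(c1 @ c2)
    if np.iscomplexobj(covmean):          # matrix square root can pick up tiny imaginary parts
        covmean = covmean.real
    return float(np.sum((mu1 - mu2) ** 2) + np.trace(c1 + c2 - 2.0 * covmean))

a = np.random.randn(500, 8)
b = np.random.randn(500, 8) + 0.5
print(frechet_distance(a, b))
```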

arXiv:2305.15357 (https://arxiv.org/abs/2305.15357) [pdf, other] eess.IV cs.CV cs.LG
Solving Diffusion ODEs with Optimal Boundary Conditions for Better Image Super-Resolution
Authors: Yiyang Ma, Huan Yang, Wenhan Yang, Jianlong Fu, Jiaying Liu
Abstract: Diffusion models, as a kind of powerful generative model, have given impressive results on image super-resolution (SR) tasks. However, due to the randomness introduced in the reverse process of diffusion models, the performance of diffusion-based SR models fluctuates from one sampling run to the next, especially for samplers with few resampled steps. This inherent randomness of diffusion models results in ineffectiveness and instability, making it challenging for users to guarantee the quality of SR results. However, our work takes this randomness as an opportunity: fully analyzing and leveraging it leads to the construction of an effective plug-and-play sampling method that has the potential to benefit a series of diffusion-based SR methods. In more detail, we propose to steadily sample high-quality SR images from pre-trained diffusion-based SR models by solving diffusion ordinary differential equations (diffusion ODEs) with optimal boundary conditions (BCs), and we analyze the relationship between the choice of BCs and the corresponding SR results. Our analysis shows how to obtain an approximately optimal BC via an efficient exploration of the whole space. The quality of SR results sampled by the proposed method with fewer steps outperforms the quality of results sampled by current methods with randomness from the same pre-trained diffusion-based SR model, which means that our sampling method "boosts" current diffusion-based SR models without any additional training.
Submitted 1 April, 2024; v1 submitted 24 May, 2023; originally announced May 2023.
Comments: Accepted by ICLR 2024.

arXiv:2304.10686 (https://arxiv.org/abs/2304.10686) [pdf, other] eess.SY cs.LG
A generalised multi-factor deep learning electricity load forecasting model for wildfire-prone areas
Authors: Weijia Yang, Sarah N. Sparrow, David C. H. Wallom
Abstract: This paper proposes a generalised and robust multi-factor Gated Recurrent Unit (GRU) based Deep Learning (DL) model to forecast electricity load in distribution networks during wildfire seasons. The flexible modelling methods consider data input structure, calendar effects and correlation-based leading temperature conditions. Compared to the regular use of instantaneous temperature, the Mean Absolute Percentage Error (MAPE) is decreased by 30.73% by using the proposed input feature selection and leading temperature relationships. Our model is generalised and applied to eight real distribution networks in Victoria, Australia, during the wildfire seasons of 2015-2020. We demonstrate that the GRU-based model consistently outperforms another DL model, Long Short-Term Memory (LSTM), at every step, giving average improvements in Mean Squared Error (MSE) and MAPE of 10.06% and 12.86%, respectively. The sensitivity to large-scale climate variability in training data sets, e.g. El Niño or La Niña years, is considered to understand the possible consequences for load forecasting performance stability, showing minimal impact. Other factors, such as regional poverty rate and large-scale off-peak electricity use, could further improve forecast performance. The proposed method achieves an average forecast MAPE of around 3%, giving a potential annual energy saving of AU$80.46 million for the state of Victoria.
Submitted 20 April, 2023; originally announced April 2023.
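
A minimal GRU-based forecaster of the general kind described in the abstract maps a window of past load and temperature features to the next load value, and MAPE is the reported error metric. The sketch below is illustrative only; the class name `GRUForecaster`, the window length, the feature count, and the layer sizes are assumptions.

```python
# Minimal sketch: GRU forecaster for next-step load, with MAPE as the error metric.
import torch
import torch.nn as nn

class GRUForecaster(nn.Module):
    def __init__(self, n_features=2, hidden=32):
        super().__init__()
        self.gru = nn.GRU(n_features, hidden, batch_first=True)
        self.head = nn.Linear(hidden, 1)

    def forward(self, x):          # x: (batch, window, n_features)
        _, h = self.gru(x)         # h: (num_layers, batch, hidden), final hidden state
        return self.head(h[-1])    # (batch, 1) next-step load

def mape(pred, target):
    return (torch.abs((target - pred) / target)).mean() * 100.0

model = GRUForecaster()
x = torch.rand(16, 48, 2)          # e.g. 48 half-hourly steps of load plus a leading-temperature feature
y = torch.rand(16, 1) + 0.5
print(model(x).shape, float(mape(model(x), y)))
```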

arXiv:2303.09560 (https://arxiv.org/abs/2303.09560) [pdf] eess.SY math.PR
Methodology for Capacity Credit Evaluation of Physical and Virtual Energy Storage in Decarbonized Power System
Authors: Ning Qi, Peng Li, Lin Cheng, Ziyi Zhang, Wenrui Huang, Weiwei Yang
Abstract: Energy storage (ES) and virtual energy storage (VES) are key components to realizing power system decarbonization. Although ES and VES have been proven to deliver various types of grid services, little work has so far provided a systematic framework for quantifying their adequacy contribution and credible capacity value while incorporating human and market behavior. Therefore, this manuscript proposes a novel framework to evaluate the capacity credit (CC) of ES and VES. To address the system capacity inadequacy and the market behavior of storage, a two-stage coordinated dispatch is proposed to achieve the trade-off between day-ahead self-energy management of resources and efficient adjustment to real-time failures. We further model human behavior in storage operation and incorporate two types of decision-independent uncertainties (DIUs) (operating state and self-consumption) and one type of decision-dependent uncertainty (DDU) (available capacity) into the proposed dispatch. Furthermore, novel reliability and CC indices (e.g., equivalent physical storage capacity (EPSC)) are introduced to evaluate the practical and theoretical adequacy contribution of ES and VES, as well as the ability to displace generation and physical storage while maintaining equivalent system adequacy. Exhaustive case studies based on the IEEE RTS-79 system and real-world data verify the significant consequence (a 10%-70% overestimated CC) of overlooking DIUs and DDUs in previous works, while the proposed method outperforms the others and generates credible, realistic results. Finally, we investigate key factors affecting the adequacy contribution of ES and VES, and reasonable suggestions are provided for better flexibility utilization of ES and VES in decarbonized power systems.
Submitted 16 March, 2023; originally announced March 2023.
Comments: capacity credit, decision-dependent uncertainty, decarbonized power system.

Pages: 1 (current), 2, 3. Next page: /search/?searchtype=author&query=Yang%2C+W&start=50
