
Search | arXiv e-print repository

<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1&ndash;50 of 51 results for author: <span class="mathjax">Huang, D</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> <div class="content"> <form method="GET" action="/search/eess" aria-role="search"> Searching in archive <strong>eess</strong>. <a href="/search/?searchtype=author&amp;query=Huang%2C+D">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Huang, D"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Huang%2C+D&amp;terms-0-field=author&amp;size=50&amp;order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Huang, D"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Huang%2C+D&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Huang%2C+D&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Huang%2C+D&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.12019">arXiv:2502.12019</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.12019">pdf</a>, <a href="https://arxiv.org/format/2502.12019">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Robotic CBCT Meets Robotic Ultrasound </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Li%2C+F">Feng Li</a>, <a href="/search/eess?searchtype=author&amp;query=Bi%2C+Y">Yuan Bi</a>, <a href="/search/eess?searchtype=author&amp;query=Huang%2C+D">Dianye Huang</a>, <a href="/search/eess?searchtype=author&amp;query=Jiang%2C+Z">Zhongliang Jiang</a>, <a href="/search/eess?searchtype=author&amp;query=Navab%2C+N">Nassir Navab</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.12019v1-abstract-short" style="display: inline;"> The multi-modality imaging system offers optimal fused images for safe and precise interventions in modern clinical practices, such as computed tomography - ultrasound (CT-US) guidance for needle insertion. However, the limited dexterity and mobility of current imaging devices hinder their integration into standardized workflows and the advancement toward fully autonomous intervention systems. In&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.12019v1-abstract-full').style.display = 'inline'; document.getElementById('2502.12019v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.12019v1-abstract-full" style="display: none;"> The multi-modality imaging system offers optimal fused images for safe and precise interventions in modern clinical practices, such as computed tomography - ultrasound (CT-US) guidance for needle insertion. 
However, the limited dexterity and mobility of current imaging devices hinder their integration into standardized workflows and the advancement toward fully autonomous intervention systems. In this paper, we present a novel clinical setup where robotic cone beam computed tomography (CBCT) and robotic US are pre-calibrated and dynamically co-registered, enabling new clinical applications. This setup allows registration-free rigid registration, facilitating multi-modal guided procedures in the absence of tissue deformation. First, a one-time pre-calibration is performed between the systems. To ensure a safe insertion path by highlighting critical vasculature on the 3D CBCT, SAM2 segments vessels from B-mode images, using the Doppler signal as an autonomously generated prompt. Based on the registration, the Doppler image or segmented vessel masks are then mapped onto the CBCT, creating an optimally fused image with comprehensive detail. To validate the system, we used a specially designed phantom, featuring lesions covered by ribs and multiple vessels with simulated moving flow. The mapping error between US and CBCT resulted in an average deviation of 1.72+-0.62 mm. A user study demonstrated the effectiveness of CBCT-US fusion for needle insertion guidance, showing significant improvements in time efficiency, accuracy, and success rate. Needle intervention performance improved by approximately 50% compared to the conventional US-guided workflow. We present the first robotic dual-modality imaging system designed to guide clinical applications. The results show significant performance improvements compared to traditional manual interventions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.12019v1-abstract-full').style.display = 'none'; document.getElementById('2502.12019v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.09531">arXiv:2502.09531</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.09531">pdf</a>, <a href="https://arxiv.org/ps/2502.09531">ps</a>, <a href="https://arxiv.org/format/2502.09531">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Data-Enabled Predictive Control for Flexible Spacecraft </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Wang%2C+H">Huanqing Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+K">Kaixiang Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Vahidi-Moghaddam%2C+A">Amin Vahidi-Moghaddam</a>, <a href="/search/eess?searchtype=author&amp;query=An%2C+H">Haowei An</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+N">Nan Li</a>, <a href="/search/eess?searchtype=author&amp;query=Huang%2C+D">Daning Huang</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+Z">Zhaojian Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.09531v1-abstract-short" style="display: inline;"> Spacecraft are vital to space exploration and are often equipped with lightweight, flexible appendages to meet strict weight constraints. These appendages pose significant challenges for modeling and control due to their inherent nonlinearity. Data-driven control methods have gained traction to address such challenges. This paper introduces, to the best of the authors&#39; knowledge, the first applica&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.09531v1-abstract-full').style.display = 'inline'; document.getElementById('2502.09531v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.09531v1-abstract-full" style="display: none;"> Spacecraft are vital to space exploration and are often equipped with lightweight, flexible appendages to meet strict weight constraints. These appendages pose significant challenges for modeling and control due to their inherent nonlinearity. Data-driven control methods have gained traction to address such challenges. This paper introduces, to the best of the authors&#39; knowledge, the first application of the data-enabled predictive control (DeePC) framework to boundary control for flexible spacecraft. Leveraging the fundamental lemma, DeePC constructs a non-parametric model by utilizing recorded past trajectories, eliminating the need for explicit model development. The developed method also incorporates dimension reduction techniques to enhance computational efficiency. Through comprehensive numerical simulations, this study compares the proposed method with Lyapunov-based control, demonstrating superior performance and offering a thorough evaluation of data-driven control for flexible spacecraft. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.09531v1-abstract-full').style.display = 'none'; document.getElementById('2502.09531v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12791">arXiv:2411.12791</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.12791">pdf</a>, <a href="https://arxiv.org/format/2411.12791">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Mitigating Perception Bias: A Training-Free Approach to Enhance LMM for Image Quality Assessment </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Pan%2C+S">Siyi Pan</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+B">Baoliang Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Huang%2C+D">Danni Huang</a>, <a href="/search/eess?searchtype=author&amp;query=Zhu%2C+H">Hanwei Zhu</a>, <a href="/search/eess?searchtype=author&amp;query=Zhu%2C+L">Lingyu Zhu</a>, <a href="/search/eess?searchtype=author&amp;query=Sui%2C+X">Xiangjie Sui</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+S">Shiqi Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12791v1-abstract-short" style="display: inline;"> Despite the impressive performance of large multimodal models (LMMs) in high-level visual tasks, their capacity for image quality assessment (IQA) remains limited. One main reason is that LMMs are primarily trained for high-level tasks (e.g., image captioning), emphasizing unified image semantics extraction under varied quality. Such semantic-aware yet quality-insensitive perception bias inevitabl&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12791v1-abstract-full').style.display = 'inline'; document.getElementById('2411.12791v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.12791v1-abstract-full" style="display: none;"> Despite the impressive performance of large multimodal models (LMMs) in high-level visual tasks, their capacity for image quality assessment (IQA) remains limited. One main reason is that LMMs are primarily trained for high-level tasks (e.g., image captioning), emphasizing unified image semantics extraction under varied quality. Such semantic-aware yet quality-insensitive perception bias inevitably leads to a heavy reliance on image semantics when those LMMs are forced for quality rating. In this paper, instead of retraining or tuning an LMM costly, we propose a training-free debiasing framework, in which the image quality prediction is rectified by mitigating the bias caused by image semantics. 
Specifically, we first explore several semantic-preserving distortions that can significantly degrade image quality while maintaining identifiable semantics. By applying these specific distortions to the query or test images, we ensure that the degraded images are recognized as poor quality while their semantics remain. During quality inference, both a query image and its corresponding degraded version are fed to the LMM along with a prompt indicating that the query image quality should be inferred under the condition that the degraded one is deemed poor quality.This prior condition effectively aligns the LMM&#39;s quality perception, as all degraded images are consistently rated as poor quality, regardless of their semantic difference.Finally, the quality scores of the query image inferred under different prior conditions (degraded versions) are aggregated using a conditional probability model. Extensive experiments on various IQA datasets show that our debiasing framework could consistently enhance the LMM performance and the code will be publicly available. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12791v1-abstract-full').style.display = 'none'; document.getElementById('2411.12791v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.10178">arXiv:2411.10178</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.10178">pdf</a>, <a href="https://arxiv.org/format/2411.10178">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Channel-Adaptive Wireless Image Semantic Transmission with Learnable Prompts </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+L">Liang Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Huang%2C+D">Danlan Huang</a>, <a href="/search/eess?searchtype=author&amp;query=Zhou%2C+X">Xinyi Zhou</a>, <a href="/search/eess?searchtype=author&amp;query=Ding%2C+F">Feng Ding</a>, <a href="/search/eess?searchtype=author&amp;query=Wu%2C+S">Sheng Wu</a>, <a href="/search/eess?searchtype=author&amp;query=Wei%2C+Z">Zhiqing Wei</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.10178v1-abstract-short" style="display: inline;"> Recent developments in Deep learning based Joint Source-Channel Coding (DeepJSCC) have demonstrated impressive capabilities within wireless semantic communications system. However, existing DeepJSCC methodologies exhibit limited generalization ability across varying channel conditions, necessitating the preparation of multiple models. 
Optimal performance is only attained when the channel status du&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10178v1-abstract-full').style.display = 'inline'; document.getElementById('2411.10178v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.10178v1-abstract-full" style="display: none;"> Recent developments in Deep learning based Joint Source-Channel Coding (DeepJSCC) have demonstrated impressive capabilities within wireless semantic communications system. However, existing DeepJSCC methodologies exhibit limited generalization ability across varying channel conditions, necessitating the preparation of multiple models. Optimal performance is only attained when the channel status during testing aligns precisely with the training channel status, which is very inconvenient for real-life applications. In this paper, we introduce a novel DeepJSCC framework, termed Prompt JSCC (PJSCC), which incorporates a learnable prompt to implicitly integrate the physical channel state into the transmission system. Specifically, the Channel State Prompt (CSP) module is devised to generate prompts based on diverse SNR and channel distribution models. Through the interaction of latent image features with channel features derived from the CSP module, the DeepJSCC process dynamically adapts to varying channel conditions without necessitating retraining. Comparative analyses against leading DeepJSCC methodologies and traditional separate coding approaches reveal that the proposed PJSCC achieves optimal image reconstruction performance across different SNR settings and various channel models, as assessed by Peak Signal-to-Noise Ratio (PSNR) and Learning-based Perceptual Image Patch Similarity (LPIPS) metrics. Furthermore, in real-world scenarios, PJSCC shows excellent memory efficiency and scalability, rendering it readily deployable on resource-constrained platforms to facilitate semantic communications. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10178v1-abstract-full').style.display = 'none'; document.getElementById('2411.10178v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> GLOBECOM 2024 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.09936">arXiv:2411.09936</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.09936">pdf</a>, <a href="https://arxiv.org/format/2411.09936">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> A Multi-Scale Spatial-Temporal Network for Wireless Video Transmission </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Zhou%2C+X">Xinyi Zhou</a>, <a href="/search/eess?searchtype=author&amp;query=Huang%2C+D">Danlan Huang</a>, <a href="/search/eess?searchtype=author&amp;query=Qi%2C+Z">Zhixin Qi</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+L">Liang Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Jiang%2C+T">Ting Jiang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.09936v1-abstract-short" style="display: inline;"> Deep joint source-channel coding (DeepJSCC) has shown promise in wireless transmission of text, speech, and images within the realm of semantic communication. However, wireless video transmission presents greater challenges due to the difficulty of extracting and compactly representing both spatial and temporal features, as well as its significant bandwidth and computational resource requirements.&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09936v1-abstract-full').style.display = 'inline'; document.getElementById('2411.09936v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.09936v1-abstract-full" style="display: none;"> Deep joint source-channel coding (DeepJSCC) has shown promise in wireless transmission of text, speech, and images within the realm of semantic communication. However, wireless video transmission presents greater challenges due to the difficulty of extracting and compactly representing both spatial and temporal features, as well as its significant bandwidth and computational resource requirements. In response, we propose a novel video DeepJSCC (VDJSCC) approach to enable end-to-end video transmission over a wireless channel. Our approach involves the design of a multi-scale vision Transformer encoder and decoder to effectively capture spatial-temporal representations over long-term frames. Additionally, we propose a dynamic token selection module to mask less semantically important tokens from spatial or temporal dimensions, allowing for content-adaptive variable-length video coding by adjusting the token keep ratio. Experimental results demonstrate the effectiveness of our VDJSCC approach compared to digital schemes that use separate source and channel codes, as well as other DeepJSCC schemes, in terms of reconstruction quality and bandwidth reduction. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09936v1-abstract-full').style.display = 'none'; document.getElementById('2411.09936v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">2024 IEEE Global Communications Conference (GLOBECOM)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.09349">arXiv:2411.09349</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.09349">pdf</a>, <a href="https://arxiv.org/format/2411.09349">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> ParaLBench: A Large-Scale Benchmark for Computational Paralinguistics over Acoustic Foundation Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+Z">Zixing Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Xu%2C+W">Weixiang Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Dong%2C+Z">Zhongren Dong</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+K">Kanglin Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Wu%2C+Y">Yimeng Wu</a>, <a href="/search/eess?searchtype=author&amp;query=Peng%2C+J">Jing Peng</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+R">Runming Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Huang%2C+D">Dong-Yan Huang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.09349v1-abstract-short" style="display: inline;"> Computational paralinguistics (ComParal) aims to develop algorithms and models to automatically detect, analyze, and interpret non-verbal information from speech communication, e. g., emotion, health state, age, and gender. Despite its rapid progress, it heavily depends on sophisticatedly designed models given specific paralinguistic tasks. Thus, the heterogeneity and diversity of ComParal models&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09349v1-abstract-full').style.display = 'inline'; document.getElementById('2411.09349v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.09349v1-abstract-full" style="display: none;"> Computational paralinguistics (ComParal) aims to develop algorithms and models to automatically detect, analyze, and interpret non-verbal information from speech communication, e. g., emotion, health state, age, and gender. Despite its rapid progress, it heavily depends on sophisticatedly designed models given specific paralinguistic tasks. Thus, the heterogeneity and diversity of ComParal models largely prevent the realistic implementation of ComParal models. 
Recently, with the advent of acoustic foundation models because of self-supervised learning, developing more generic models that can efficiently perceive a plethora of paralinguistic information has become an active topic in speech processing. However, it lacks a unified evaluation framework for a fair and consistent performance comparison. To bridge this gap, we conduct a large-scale benchmark, namely ParaLBench, which concentrates on standardizing the evaluation process of diverse paralinguistic tasks, including critical aspects of affective computing such as emotion recognition and emotion dimensions prediction, over different acoustic foundation models. This benchmark contains ten datasets with thirteen distinct paralinguistic tasks, covering short-, medium- and long-term characteristics. Each task is carried out on 14 acoustic foundation models under a unified evaluation framework, which allows for an unbiased methodological comparison and offers a grounded reference for the ComParal community. Based on the insights gained from ParaLBench, we also point out potential research directions, i.e., the cross-corpus generalizability, to propel ComParal research in the future. The code associated with this study will be available to foster the transparency and replicability of this work for succeeding researchers. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09349v1-abstract-full').style.display = 'none'; document.getElementById('2411.09349v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.08490">arXiv:2410.08490</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.08490">pdf</a>, <a href="https://arxiv.org/format/2410.08490">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> CAS-GAN for Contrast-free Angiography Synthesis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Huang%2C+D">De-Xing Huang</a>, <a href="/search/eess?searchtype=author&amp;query=Zhou%2C+X">Xiao-Hu Zhou</a>, <a href="/search/eess?searchtype=author&amp;query=Gui%2C+M">Mei-Jiang Gui</a>, <a href="/search/eess?searchtype=author&amp;query=Xie%2C+X">Xiao-Liang Xie</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+S">Shi-Qi Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+S">Shuang-Yi Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+H">Hao Li</a>, <a href="/search/eess?searchtype=author&amp;query=Xiang%2C+T">Tian-Yu Xiang</a>, <a href="/search/eess?searchtype=author&amp;query=Hou%2C+Z">Zeng-Guang Hou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.08490v3-abstract-short" style="display: inline;"> Iodinated contrast agents are widely utilized in numerous interventional procedures, yet posing substantial health risks to patients. This paper presents CAS-GAN, a novel GAN framework that serves as a &#34;virtual contrast agent&#34; to synthesize X-ray angiographies via disentanglement representation learning and vessel semantic guidance, thereby reducing the reliance on iodinated contrast agents during&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.08490v3-abstract-full').style.display = 'inline'; document.getElementById('2410.08490v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.08490v3-abstract-full" style="display: none;"> Iodinated contrast agents are widely utilized in numerous interventional procedures, yet posing substantial health risks to patients. This paper presents CAS-GAN, a novel GAN framework that serves as a &#34;virtual contrast agent&#34; to synthesize X-ray angiographies via disentanglement representation learning and vessel semantic guidance, thereby reducing the reliance on iodinated contrast agents during interventional procedures. Specifically, our approach disentangles X-ray angiographies into background and vessel components, leveraging medical prior knowledge. A specialized predictor then learns to map the interrelationships between these components. Additionally, a vessel semantic-guided generator and a corresponding loss function are introduced to enhance the visual fidelity of generated images. Experimental results on the XCAD dataset demonstrate the state-of-the-art performance of our CAS-GAN, achieving a FID of 5.87 and a MMD of 0.016. These promising results highlight CAS-GAN&#39;s potential for clinical applications. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.08490v3-abstract-full').style.display = 'none'; document.getElementById('2410.08490v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 10 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">IEEE Symposium Series on Computational Intelligence (SSCI 2025)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.19583">arXiv:2409.19583</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.19583">pdf</a>, <a href="https://arxiv.org/format/2409.19583">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Quantitative Methods">q-bio.QM</span> </div> </div> <p class="title is-5 mathjax"> Brain Tumor Classification on MRI in Light of Molecular Markers </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Liu%2C+J">Jun Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Yuan%2C+G">Geng Yuan</a>, <a href="/search/eess?searchtype=author&amp;query=Zeng%2C+W">Weihao Zeng</a>, <a href="/search/eess?searchtype=author&amp;query=Tang%2C+H">Hao Tang</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+W">Wenbin Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Lin%2C+X">Xue Lin</a>, <a href="/search/eess?searchtype=author&amp;query=Xu%2C+X">XiaoLin Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Huang%2C+D">Dong Huang</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+Y">Yanzhi Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.19583v1-abstract-short" style="display: inline;"> In research findings, co-deletion of the 1p/19q gene is associated with clinical outcomes in low-grade gliomas. The ability to predict 1p19q status is critical for treatment planning and patient follow-up. This study aims to utilize a specially MRI-based convolutional neural network for brain cancer detection. Although public networks such as RestNet and AlexNet can effectively diagnose brain canc&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.19583v1-abstract-full').style.display = 'inline'; document.getElementById('2409.19583v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.19583v1-abstract-full" style="display: none;"> In research findings, co-deletion of the 1p/19q gene is associated with clinical outcomes in low-grade gliomas. 
The ability to predict 1p19q status is critical for treatment planning and patient follow-up. This study aims to utilize a specially MRI-based convolutional neural network for brain cancer detection. Although public networks such as RestNet and AlexNet can effectively diagnose brain cancers using transfer learning, the model includes quite a few weights that have nothing to do with medical images. As a result, the diagnostic results are unreliable by the transfer learning model. To deal with the problem of trustworthiness, we create the model from the ground up, rather than depending on a pre-trained model. To enable flexibility, we combined convolution stacking with a dropout and full connect operation, it improved performance by reducing overfitting. During model training, we also supplement the given dataset and inject Gaussian noise. We use three--fold cross-validation to train the best selection model. Comparing InceptionV3, VGG16, and MobileNetV2 fine-tuned with pre-trained models, our model produces better results. On an validation set of 125 codeletion vs. 31 not codeletion images, the proposed network achieves 96.37\% percent F1-score, 97.46\% percent precision, and 96.34\% percent recall when classifying 1p/19q codeletion and not codeletion images. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.19583v1-abstract-full').style.display = 'none'; document.getElementById('2409.19583v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICAI&#39;22 - The 24th International Conference on Artificial Intelligence, The 2022 World Congress in Computer Science, Computer Engineering, &amp; Applied Computing (CSCE&#39;22), Las Vegas, USA. The paper acceptance rate 17% for regular papers. 
The publication of the CSCE 2022 conference proceedings has been delayed due to the pandemic</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Springer Nature - Book Series: Transactions on Computational Science &amp; Computational Intelligence, 2022 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.10500">arXiv:2408.10500</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.10500">pdf</a>, <a href="https://arxiv.org/format/2408.10500">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3689092.3689404">10.1145/3689092.3689404 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> SZTU-CMU at MER2024: Improving Emotion-LLaMA with Conv-Attention for Multimodal Emotion Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Cheng%2C+Z">Zebang Cheng</a>, <a href="/search/eess?searchtype=author&amp;query=Tu%2C+S">Shuyuan Tu</a>, <a href="/search/eess?searchtype=author&amp;query=Huang%2C+D">Dawei Huang</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+M">Minghan Li</a>, <a href="/search/eess?searchtype=author&amp;query=Peng%2C+X">Xiaojiang Peng</a>, <a href="/search/eess?searchtype=author&amp;query=Cheng%2C+Z">Zhi-Qi Cheng</a>, <a href="/search/eess?searchtype=author&amp;query=Hauptmann%2C+A+G">Alexander G. Hauptmann</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.10500v2-abstract-short" style="display: inline;"> This paper presents our winning approach for the MER-NOISE and MER-OV tracks of the MER2024 Challenge on multimodal emotion recognition. Our system leverages the advanced emotional understanding capabilities of Emotion-LLaMA to generate high-quality annotations for unlabeled samples, addressing the challenge of limited labeled data. To enhance multimodal fusion while mitigating modality-specific n&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.10500v2-abstract-full').style.display = 'inline'; document.getElementById('2408.10500v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.10500v2-abstract-full" style="display: none;"> This paper presents our winning approach for the MER-NOISE and MER-OV tracks of the MER2024 Challenge on multimodal emotion recognition. Our system leverages the advanced emotional understanding capabilities of Emotion-LLaMA to generate high-quality annotations for unlabeled samples, addressing the challenge of limited labeled data. 
To enhance multimodal fusion while mitigating modality-specific noise, we introduce Conv-Attention, a lightweight and efficient hybrid framework. Extensive experimentation vali-dates the effectiveness of our approach. In the MER-NOISE track, our system achieves a state-of-the-art weighted average F-score of 85.30%, surpassing the second and third-place teams by 1.47% and 1.65%, respectively. For the MER-OV track, our utilization of Emotion-LLaMA for open-vocabulary annotation yields an 8.52% improvement in average accuracy and recall compared to GPT-4V, securing the highest score among all participating large multimodal models. The code and model for Emotion-LLaMA are available at https://github.com/ZebangCheng/Emotion-LLaMA. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.10500v2-abstract-full').style.display = 'none'; document.getElementById('2408.10500v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 19 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Ranked 1st in MER24@IJCAI and MRAC24@ACM MM (MER-NOISE &amp; MER-OV (self-evaluated))</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.00916">arXiv:2408.00916</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.00916">pdf</a>, <a href="https://arxiv.org/format/2408.00916">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> A reference frame-based microgrid primary control for ensuring global convergence to a periodic orbit </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Jiang%2C+X">Xinyuan Jiang</a>, <a href="/search/eess?searchtype=author&amp;query=Lagoa%2C+C+M">Constantino M. Lagoa</a>, <a href="/search/eess?searchtype=author&amp;query=Huang%2C+D">Daning Huang</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+Y">Yan Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.00916v1-abstract-short" style="display: inline;"> Electric power systems with growing penetration of renewable generation face problems of frequency oscillation and increased uncertainty as the operating point may veer close to instability. 
Traditionally the stability of these systems is studied either in terms of local stability or as an angle synchronization problem under the simplifying assumption that decouples the amplitude along with all di&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.00916v1-abstract-full').style.display = 'inline'; document.getElementById('2408.00916v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.00916v1-abstract-full" style="display: none;"> Electric power systems with growing penetration of renewable generation face problems of frequency oscillation and increased uncertainty as the operating point may veer close to instability. Traditionally the stability of these systems is studied either in terms of local stability or as an angle synchronization problem under the simplifying assumption that decouples the amplitude along with all dissipations. Without the simplifying assumption, however, the steady state being studied is basically a limit cycle with the convergence of its orbit in question. In this paper we present an analysis of the orbital stability of a microgrid integrating the proposed type of distributed generation controller, whose internal reference voltage arises from the rotation of the reference frame much like a rotating machine. We utilize the shifted passivity framework to prove that, with sufficient dissipation, such system is globally convergent to a nontrivial orbit. This is the first global stability result for the limit cycle of such system in the full state space, which provides new insight into the synchronization mechanism as well as how dissipation plays a role in the orbital stability. The proposed controller is verified with a test microgrid, demonstrating its stability and transient smoothness compared to the standard droop control. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.00916v1-abstract-full').style.display = 'none'; document.getElementById('2408.00916v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.16779">arXiv:2407.16779</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.16779">pdf</a>, <a href="https://arxiv.org/format/2407.16779">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Learning Networked Dynamical System Models with Weak Form and Graph Neural Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Yu%2C+Y">Yin Yu</a>, <a href="/search/eess?searchtype=author&amp;query=Huang%2C+D">Daning Huang</a>, <a href="/search/eess?searchtype=author&amp;query=Park%2C+S">Seho Park</a>, <a href="/search/eess?searchtype=author&amp;query=Pangborn%2C+H+C">Herschel C. 
Pangborn</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.16779v1-abstract-short" style="display: inline;"> This paper presents a sequence of two approaches for the data-driven control-oriented modeling of networked systems, i.e., the systems that involve many interacting dynamical components. First, a novel deep learning approach named the weak Latent Dynamics Model (wLDM) is developed for learning generic nonlinear dynamics with control. Leveraging the weak form, the wLDM enables more numerically stab&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.16779v1-abstract-full').style.display = 'inline'; document.getElementById('2407.16779v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.16779v1-abstract-full" style="display: none;"> This paper presents a sequence of two approaches for the data-driven control-oriented modeling of networked systems, i.e., the systems that involve many interacting dynamical components. First, a novel deep learning approach named the weak Latent Dynamics Model (wLDM) is developed for learning generic nonlinear dynamics with control. Leveraging the weak form, the wLDM enables more numerically stable and computationally efficient training as well as more accurate prediction, when compared to conventional methods such as neural ordinary differential equations. Building upon the wLDM framework, we propose the weak Graph Koopman Bilinear Form (wGKBF) model, which integrates geometric deep learning and Koopman theory to learn latent space dynamics for networked systems, especially for the challenging cases having multiple timescales. The effectiveness of the wLDM framework and wGKBF model are demonstrated on three example systems of increasing complexity - a controlled double pendulum, the stiff Brusselator dynamics, and an electrified aircraft energy system. These numerical examples show that the wLDM and wGKBF achieve superior predictive accuracy and training efficiency as compared to baseline models. Parametric studies provide insights into the effects of hyperparameters in the weak form. The proposed framework shows the capability to efficiently capture control-dependent dynamics in these systems, including stiff dynamics and multi-physics interactions, offering a promising direction for learning control-oriented models of complex networked systems. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.16779v1-abstract-full').style.display = 'none'; document.getElementById('2407.16779v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. 
arXiv:2406.19749 [pdf, other] eess.IV cs.CV
SPIRONet: Spatial-Frequency Learning and Topological Channel Interaction Network for Vessel Segmentation
Authors: De-Xing Huang, Xiao-Hu Zhou, Xiao-Liang Xie, Shi-Qi Liu, Shuang-Yi Wang, Zhen-Qiu Feng, Mei-Jiang Gui, Hao Li, Tian-Yu Xiang, Bo-Xian Yao, Zeng-Guang Hou
Abstract: Automatic vessel segmentation is paramount for developing next-generation interventional navigation systems. However, current approaches suffer from suboptimal segmentation performance due to significant challenges in intraoperative images (i.e., low signal-to-noise ratio, small or slender vessels, and strong interference). In this paper, a novel spatial-frequency learning and topological channel interaction network (SPIRONet) is proposed to address the above issues. Specifically, dual encoders are utilized to comprehensively capture local spatial and global frequency vessel features. Then, a cross-attention fusion module is introduced to effectively fuse spatial and frequency features, thereby enhancing feature discriminability. Furthermore, a topological channel interaction module is designed to filter out task-irrelevant responses based on graph neural networks. Extensive experimental results on several challenging datasets (CADSA, CAXF, DCA1, and XCAD) demonstrate state-of-the-art performance of our method. Moreover, the inference speed of SPIRONet is 21 FPS with a 512x512 input size, surpassing clinical real-time requirements (6-12 FPS). These promising outcomes indicate SPIRONet's potential for integration into vascular interventional navigation systems. Code is available at https://github.com/Dxhuang-CASIA/SPIRONet.
Submitted 28 June, 2024; originally announced June 2024.
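As a small, hedged sketch of one ingredient named above, the "global frequency" view of an image, the snippet below builds a frequency-domain representation with the 2-D FFT in PyTorch; the function and its output layout are illustrative stand-ins, not SPIRONet's actual frequency encoder.

```python
import torch
import torch.fft

def frequency_features(img: torch.Tensor) -> torch.Tensor:
    """Global frequency-domain representation of an image batch.

    img: (B, C, H, W) float tensor. Returns (B, 2*C, H, W) holding the
    log-magnitude and phase of the centered 2-D FFT, a simple stand-in
    for a 'global frequency' branch.
    """
    spec = torch.fft.fftshift(torch.fft.fft2(img, norm="ortho"), dim=(-2, -1))
    log_mag = torch.log1p(spec.abs())   # compress the large dynamic range
    phase = torch.angle(spec)
    return torch.cat([log_mag, phase], dim=1)

x = torch.randn(2, 1, 64, 64)
print(frequency_features(x).shape)      # torch.Size([2, 2, 64, 64])
```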
arXiv:2406.06375 [pdf, other] cs.SD cs.AI eess.AS; doi: 10.1109/TASLP.2024.3407529
MOSA: Music Motion with Semantic Annotation Dataset for Cross-Modal Music Processing
Authors: Yu-Fen Huang, Nikki Moran, Simon Coleman, Jon Kelly, Shun-Hwa Wei, Po-Yin Chen, Yun-Hsin Huang, Tsung-Ping Chen, Yu-Chia Kuo, Yu-Chi Wei, Chih-Hsuan Li, Da-Yu Huang, Hsuan-Kai Kao, Ting-Wei Lin, Li Su
Abstract: In cross-modal music processing, translation between visual, auditory, and semantic content opens up new possibilities as well as challenges. The construction of such a transformative scheme depends upon a benchmark corpus with a comprehensive data infrastructure. In particular, the assembly of a large-scale cross-modal dataset presents major challenges. In this paper, we present the MOSA (Music mOtion with Semantic Annotation) dataset, which contains high-quality 3-D motion capture data, aligned audio recordings, and note-by-note semantic annotations of pitch, beat, phrase, dynamics, articulation, and harmony for 742 professional music performances by 23 professional musicians, comprising more than 30 hours and 570 K notes of data. To our knowledge, this is the largest cross-modal music dataset with note-level annotations to date. To demonstrate the usage of the MOSA dataset, we present several innovative cross-modal music information retrieval (MIR) and musical content generation tasks, including the detection of beats, downbeats, phrases, and expressive content from audio, video and motion data, and the generation of musicians' body motion from given music audio. The dataset and code are available alongside this publication (https://github.com/yufenhuang/MOSA-Music-mOtion-and-Semantic-Annotation-dataset).
Submitted 10 June, 2024; originally announced June 2024.
Comments: IEEE/ACM Transactions on Audio, Speech, and Language Processing, 2024. 14 pages, 7 figures. Dataset is available on: https://github.com/yufenhuang/MOSA-Music-mOtion-and-Semantic-Annotation-dataset/tree/main and https://zenodo.org/records/11393449

arXiv:2406.05170 [pdf] q-bio.OT cs.CV eess.IV
Research on Tumors Segmentation based on Image Enhancement Method
Authors: Danyi Huang, Ziang Liu, Yizhou Li
Abstract: One of the most effective ways to treat liver cancer is to perform precise liver resection surgery, a key step of which is precise digital image segmentation of the liver and its tumor. However, traditional liver parenchymal segmentation techniques often face several challenges: lack of precision, slow processing speed, and high computational burden. These shortcomings limit the efficiency of surgical planning and execution. This work first describes in detail a new image enhancement algorithm that enhances the key features of an image by adaptively adjusting its contrast and brightness. Then, a deep learning-based segmentation network is introduced, which is specially trained on the enhanced images to optimize the detection accuracy of tumor regions. In addition, multi-scale analysis techniques are incorporated, allowing the model to analyze images at different resolutions to capture more nuanced tumor features. The study used the 3Dircadb dataset to test the effectiveness of the proposed method. The experimental results show that, compared with traditional image segmentation methods, the new method using image enhancement technology significantly improves the accuracy and recall rate of tumor identification.
Submitted 7 June, 2024; originally announced June 2024.
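The abstract above does not specify its enhancement algorithm, so as a minimal, hedged sketch of the general idea of adaptive contrast/brightness adjustment, the snippet below applies percentile-based contrast stretching with a brightness correction; the percentile and target-mean values are illustrative assumptions, not the paper's method.

```python
import numpy as np

def adaptive_enhance(img, low_pct=2.0, high_pct=98.0, target_mean=0.5):
    """Percentile-based contrast stretching with a brightness correction.

    img: 2-D array of scan intensities. The contrast window is chosen
    adaptively from the image histogram, and the result is shifted so its
    mean brightness matches `target_mean`; output values lie in [0, 1].
    """
    lo, hi = np.percentile(img, [low_pct, high_pct])
    stretched = np.clip((img - lo) / max(hi - lo, 1e-8), 0.0, 1.0)
    shifted = stretched + (target_mean - stretched.mean())
    return np.clip(shifted, 0.0, 1.0)

slice_ = np.random.rand(256, 256) * 400 - 100   # fake CT-like intensities
enhanced = adaptive_enhance(slice_)
print(enhanced.min(), enhanced.max(), round(enhanced.mean(), 2))
```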
arXiv:2406.03663 [pdf] eess.IV cs.LG q-bio.QM
A Hybrid Deep Learning Classification of Perimetric Glaucoma Using Peripapillary Nerve Fiber Layer Reflectance and Other OCT Parameters from Three Anatomy Regions
Authors: Ou Tan, David S. Greenfield, Brian A. Francis, Rohit Varma, Joel S. Schuman, David Huang, Dongseok Choi
Abstract: Precis: A hybrid deep-learning model combines NFL reflectance and other OCT parameters to improve glaucoma diagnosis. Objective: To investigate whether a deep learning model can be used to combine nerve fiber layer (NFL) reflectance and other OCT parameters for glaucoma diagnosis. Patients and Methods: This is a prospective observational study of 106 normal subjects and 164 perimetric glaucoma (PG) patients. Peripapillary NFL reflectance map, NFL thickness map, optic nerve head analysis of the disc, and macular ganglion cell complex thickness were obtained using spectral-domain OCT. A hybrid deep learning model combined a fully connected network (FCN) and a convolutional neural network (CNN) to develop and combine those OCT maps and parameters to distinguish normal and PG eyes. Two deep learning models were compared based on whether the NFL reflectance map was used as part of the input or not. Results: The hybrid deep learning model with reflectance achieved 0.909 sensitivity at 99% specificity and 0.926 at 95%. The overall accuracy was 0.948 with 0.893 sensitivity and 1.000 specificity, and the AROC was 0.979, which is significantly better than the logistic regression models (p < 0.001). The second-best model was the hybrid deep learning model without reflectance, which also had a significantly higher AROC than the logistic regression models (p < 0.001). The logistic regression model with reflectance had slightly higher AROC or sensitivity than the logistic regression model without reflectance (p = 0.024). Conclusions: The hybrid deep learning model significantly improved diagnostic accuracy, with or without NFL reflectance. A hybrid deep learning model combining reflectance, NFL thickness, GCC thickness, and ONH parameters may be a practical model for glaucoma screening purposes.
Submitted 5 June, 2024; originally announced June 2024.
Comments: 12 pages
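To make the "hybrid FCN + CNN" idea in the abstract concrete, here is a compact PyTorch sketch of that style of model: a small CNN branch for image-like OCT maps and a fully connected branch for scalar parameters, fused into one classification head. Layer sizes and channel counts are illustrative assumptions, not the study's architecture.

```python
import torch
import torch.nn as nn

class HybridOCTClassifier(nn.Module):
    """CNN branch for image-like OCT maps + FC branch for scalar parameters."""
    def __init__(self, map_channels=2, n_params=8):
        super().__init__()
        self.cnn = nn.Sequential(                      # maps -> feature vector
            nn.Conv2d(map_channels, 16, 3, padding=1), nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Conv2d(16, 32, 3, padding=1), nn.ReLU(),
            nn.AdaptiveAvgPool2d(1), nn.Flatten(),
        )
        self.fc = nn.Sequential(nn.Linear(n_params, 32), nn.ReLU())
        self.head = nn.Sequential(nn.Linear(32 + 32, 32), nn.ReLU(),
                                  nn.Linear(32, 1))    # logit: normal vs. PG

    def forward(self, maps, params):
        return self.head(torch.cat([self.cnn(maps), self.fc(params)], dim=1))

model = HybridOCTClassifier()
logit = model(torch.randn(4, 2, 64, 64), torch.randn(4, 8))
print(logit.shape)   # torch.Size([4, 1])
```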
arXiv:2403.14523 [pdf, other] eess.IV cs.CV
Invisible Needle Detection in Ultrasound: Leveraging Mechanism-Induced Vibration
Authors: Chenyang Li, Dianye Huang, Angelos Karlas, Nassir Navab, Zhongliang Jiang
Abstract: In clinical applications that involve ultrasound-guided intervention, the visibility of the needle can be severely impeded due to steep insertion and strong distractors such as speckle noise and anatomical occlusion. To address this challenge, we propose VibNet, a learning-based framework tailored to enhance the robustness and accuracy of needle detection in ultrasound images, even when the target becomes invisible to the naked eye. Inspired by Eulerian Video Magnification techniques, we utilize an external step motor to induce low-amplitude periodic motion on the needle. These subtle vibrations offer the potential to generate robust frequency features for detecting the motion patterns around the needle. To robustly and precisely detect the needle leveraging these vibrations, VibNet integrates learning-based Short-Time Fourier Transform and Hough Transform modules to achieve successive sub-goals, including motion feature extraction in the spatiotemporal space, frequency feature aggregation, and needle detection in the Hough space. Based on the results obtained on distinct ex vivo porcine and bovine tissue samples, the proposed algorithm exhibits superior detection performance with efficient computation and generalization capability.
Submitted 21 March, 2024; originally announced March 2024.

arXiv:2401.11856 [pdf, other] eess.IV cs.CV
MOSformer: Momentum encoder-based inter-slice fusion transformer for medical image segmentation
Authors: De-Xing Huang, Xiao-Hu Zhou, Xiao-Liang Xie, Shi-Qi Liu, Zhen-Qiu Feng, Mei-Jiang Gui, Hao Li, Tian-Yu Xiang, Xiu-Ling Liu, Zeng-Guang Hou
Abstract: Medical image segmentation plays an important role in various clinical applications. Deep learning has emerged as the predominant solution for automated segmentation of volumetric medical images. 2.5D-based segmentation models bridge the computational efficiency of 2D-based models and the spatial perception capabilities of 3D-based models. However, prevailing 2.5D-based models often treat each slice equally, failing to effectively learn and exploit inter-slice information, resulting in suboptimal segmentation performance. In this paper, a novel momentum encoder-based inter-slice fusion transformer (MOSformer) is proposed to overcome this issue by leveraging inter-slice information at multi-scale feature maps extracted by different encoders. Specifically, dual encoders are employed to enhance feature distinguishability among different slices. One of the encoders is moving-averaged to maintain the consistency of slice representations. Moreover, an IF-Swin transformer module is developed to fuse inter-slice multi-scale features. MOSformer is evaluated on three benchmark datasets (Synapse, ACDC, and AMOS), establishing a new state of the art with DSC of 85.63%, 92.19%, and 85.43%, respectively. These promising results indicate its competitiveness in medical image segmentation. Codes and models of MOSformer will be made publicly available upon acceptance.
Submitted 22 January, 2024; originally announced January 2024.
Comments: Under Review
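The "moving-averaged" encoder mentioned above is the standard momentum-encoder pattern: one encoder is trained by back-propagation while a second copy is updated only as an exponential moving average of its weights. Below is a generic PyTorch sketch of that update (illustrative toy encoder, not MOSformer's code).

```python
import copy
import torch
import torch.nn as nn

encoder = nn.Sequential(nn.Conv2d(1, 16, 3, padding=1), nn.ReLU(),
                        nn.Conv2d(16, 16, 3, padding=1))
momentum_encoder = copy.deepcopy(encoder)          # same architecture, frozen
for p in momentum_encoder.parameters():
    p.requires_grad_(False)

@torch.no_grad()
def momentum_update(online, target, m=0.999):
    """target <- m * target + (1 - m) * online (exponential moving average)."""
    for p_o, p_t in zip(online.parameters(), target.parameters()):
        p_t.mul_(m).add_(p_o, alpha=1.0 - m)

# called after each optimizer step on `encoder`:
momentum_update(encoder, momentum_encoder)
```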
arXiv:2312.07290 [pdf, other] cs.RO eess.SY
Underwater motions analysis and control of a coupling-tiltable unmanned aerial-aquatic quadrotor
Authors: Dongyue Huang, Chenggang Wang, Minghao Dou, Xuchen Liu, Zixuan Liu, Biao Wang, Ben M. Chen
Abstract: This paper proposes a method for analyzing a series of potential motions in a coupling-tiltable aerial-aquatic quadrotor based on its nonlinear dynamics. Some characteristics and constraints derived by this method are specified as Singular Thrust Tilt Angles (STTAs), which are utilized to generate motions, including planar motions. A switch-based control scheme addresses issues of control direction uncertainty inherent to the mechanical structure by incorporating a saturated Nussbaum function. A high-fidelity simulation environment incorporating a comprehensive hydrodynamic model is built based on a Hardware-In-The-Loop (HITL) setup with Gazebo and a flight control board. The experiments validate the effectiveness of the absolute and quasi planar motions, which cannot be achieved by conventional quadrotors, and demonstrate stable performance when the pitch or roll angle is activated in the auxiliary control channel.
Submitted 12 December, 2023; originally announced December 2023.
Comments: Unmanned Aerial-Aquatic Vehicle

arXiv:2312.03231 [pdf, other] cs.LG cs.AI cs.CV cs.HC eess.AS
Deep Multimodal Fusion for Surgical Feedback Classification
Authors: Rafal Kocielnik, Elyssa Y. Wong, Timothy N. Chu, Lydia Lin, De-An Huang, Jiayun Wang, Anima Anandkumar, Andrew J. Hung
Abstract: Quantification of real-time informal feedback delivered by an experienced surgeon to a trainee during surgery is important for skill improvements in surgical training. Such feedback in the live operating room is inherently multimodal, consisting of verbal conversations (e.g., questions and answers) as well as non-verbal elements (e.g., through visual cues like pointing to anatomic elements). In this work, we leverage a clinically-validated five-category classification of surgical feedback: "Anatomic", "Technical", "Procedural", "Praise" and "Visual Aid". We then develop a multi-label machine learning model to classify these five categories of surgical feedback from inputs of text, audio, and video modalities. The ultimate goal of our work is to help automate the annotation of real-time contextual surgical feedback at scale. Our automated classification of surgical feedback achieves AUCs ranging from 71.5 to 77.6, with the fusion improving performance by 3.1%. We also show that high-quality manual transcriptions of feedback audio from experts improve AUCs to between 76.5 and 96.2, which demonstrates a clear path toward future improvements. Empirically, we find that a staged training strategy, first pre-training each modality separately and then training them jointly, is more effective than training all modalities together from the start. We also present intuitive findings on the importance of modalities for different feedback categories. This work offers an important first look at the feasibility of automated classification of real-world live surgical feedback based on text, audio, and video modalities.
Submitted 5 December, 2023; originally announced December 2023.
Journal ref: Published in Proceedings of Machine Learning for Health 2024

arXiv:2311.05929 [pdf, other] cs.CV eess.IV
Efficient Segmentation with Texture in Ore Images Based on Box-supervised Approach
Authors: Guodong Sun, Delong Huang, Yuting Peng, Le Cheng, Bo Wu, Yang Zhang
Abstract: Image segmentation methods have been utilized to determine the particle size distribution of crushed ores. Due to the complex working environment, high-powered computing equipment is difficult to deploy. At the same time, the ore distribution is stacked, and it is difficult to identify the complete features. To address this issue, an effective box-supervised technique with texture features is provided for ore image segmentation that can identify complete and independent ores. Firstly, a ghost feature pyramid network (Ghost-FPN) is proposed to process the features obtained from the backbone to reduce the redundant semantic information and computation generated by complex networks. Then, an optimized detection head is proposed to obtain the features needed to maintain accuracy. Finally, Lab color space (Lab) and local binary patterns (LBP) texture features are combined to form a fusion-feature similarity-based loss function to improve accuracy while incurring no loss. Experiments on MS COCO have shown that the proposed fusion features are also worth studying on other types of datasets. Extensive experimental results demonstrate the effectiveness of the proposed method, which achieves over 50 frames per second with a small model size of 21.6 MB. Meanwhile, the method maintains a high level of accuracy compared with state-of-the-art approaches on an ore image dataset. The source code is available at https://github.com/MVME-HBUT/OREINST.
Submitted 10 November, 2023; originally announced November 2023.
Comments: 14 pages, 8 figures
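As a brief, hedged illustration of the LBP texture ingredient named in the ore-segmentation abstract (not the paper's loss function), the following scikit-image snippet computes a uniform local-binary-pattern histogram for an image region; the neighborhood parameters are illustrative choices.

```python
import numpy as np
from skimage.feature import local_binary_pattern

def lbp_histogram(gray_u8, P=8, R=1.0):
    """Uniform LBP histogram of an 8-bit grayscale region.

    Returns a normalized (P + 2)-bin histogram that can be compared
    between regions as a simple texture-similarity signal.
    """
    codes = local_binary_pattern(gray_u8, P, R, method="uniform")
    hist, _ = np.histogram(codes, bins=P + 2, range=(0, P + 2), density=True)
    return hist

region = (np.random.rand(64, 64) * 255).astype(np.uint8)
print(lbp_histogram(region).round(3))
```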
arXiv:2308.02782 [pdf] eess.IV physics.optics; doi: 10.1364/OL.501622
Non-line-of-sight reconstruction via structure sparsity regularization
Authors: Duolan Huang, Quan Chen, Zhun Wei, Rui Chen
Abstract: Non-line-of-sight (NLOS) imaging allows for the imaging of objects around a corner, which enables potential applications in various fields such as autonomous driving, robotic vision, medical imaging, security monitoring, etc. However, the quality of reconstruction is challenged by low signal-to-noise ratio (SNR) measurements. In this study, we present a regularization method, referred to as structure sparsity (SS) regularization, for denoising in NLOS reconstruction. By exploiting the prior knowledge of structure sparseness, we incorporate nuclear norm penalization into the cost function of the directional light-cone transform (DLCT) model for the NLOS imaging system. This incorporation effectively integrates the neighborhood information associated with the directional albedo, thereby facilitating the denoising process. Subsequently, the reconstruction is achieved by optimizing a directional albedo model with SS regularization using the fast iterative shrinkage-thresholding algorithm. Notably, robust reconstruction of occluded objects is observed. Through comprehensive evaluations conducted on both synthetic and experimental datasets, we demonstrate that the proposed approach yields high-quality reconstructions, surpassing state-of-the-art reconstruction algorithms, especially in scenarios involving short exposure and low-SNR measurements.
Submitted 4 August, 2023; originally announced August 2023.
Comments: 8 pages, 5 figures
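Nuclear-norm penalties of the kind mentioned above are usually handled inside FISTA-type solvers through singular value soft-thresholding. The NumPy sketch below shows that standard proximal step in isolation; it is an illustration of the building block, not the paper's DLCT-based reconstruction code.

```python
import numpy as np

def svt(M, tau):
    """Singular value thresholding: prox of tau * nuclear norm.

    Returns argmin_X 0.5*||X - M||_F^2 + tau*||X||_* by soft-thresholding
    the singular values, which favors low-rank (structured) solutions.
    """
    U, s, Vt = np.linalg.svd(M, full_matrices=False)
    s_thresh = np.maximum(s - tau, 0.0)
    return (U * s_thresh) @ Vt

# low-rank signal plus noise, then one denoising prox step
rng = np.random.default_rng(0)
L = rng.standard_normal((64, 4)) @ rng.standard_normal((4, 64))
noisy = L + 0.3 * rng.standard_normal((64, 64))
print(np.linalg.matrix_rank(svt(noisy, tau=5.0)))   # close to the true rank
```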
arXiv:2307.05884 [pdf, other] eess.SY cs.RO
Learning Koopman Operators with Control Using Bi-level Optimization
Authors: Daning Huang, Muhammad Bayu Prasetyo, Yin Yu, Junyi Geng
Abstract: The accurate modeling and control of nonlinear dynamical effects are crucial for numerous robotic systems. The Koopman formalism emerges as a valuable tool for linear control design in nonlinear systems within unknown environments. However, it still remains a challenging task to learn the Koopman operator with control from data, and in particular, the simultaneous identification of the Koopman linear dynamics and the mapping between the physical and Koopman states. Conventionally, the simultaneous learning of the dynamics and mapping is achieved via single-level optimization based on one-step or multi-step discrete-time predictions, but the learned model may lack model robustness, training efficiency, and/or long-term predictive accuracy. This paper presents a bi-level optimization framework that jointly learns the Koopman embedding mapping and Koopman dynamics with exact long-term dynamical constraints. Our formulation allows back-propagation in a standard learning framework and the use of state-of-the-art optimizers, yielding more accurate and stable system prediction over long time horizons in various applications compared to conventional methods.
Submitted 5 November, 2023; v1 submitted 11 July, 2023; originally announced July 2023.
Comments: Accepted by 2023 IEEE 62nd Conference on Decision and Control (CDC)
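For context on what the "conventional" single-level identification looks like, the NumPy sketch below fits a linear Koopman model with control, z_{k+1} ≈ A z_k + B u_k, by plain least squares over hand-picked lifting functions (an EDMD/DMDc-style baseline). The observables and the toy system are illustrative assumptions; this is the kind of baseline the bi-level formulation aims to improve on, not the paper's method.

```python
import numpy as np

def lift(x):
    """Hand-picked observables psi(x) = [x, x^2, 1] (illustrative choice)."""
    return np.concatenate([x, x**2, np.ones((1, x.shape[1]))], axis=0)

def fit_koopman_with_control(X, U):
    """Least-squares fit of z_{k+1} ~ A z_k + B u_k in the lifted space.

    X: (n, N) state snapshots, U: (m, N-1) inputs. Returns A, B minimizing
    ||Z' - [A B] [Z; U]||_F, the conventional single-level identification.
    """
    Z, Zp = lift(X[:, :-1]), lift(X[:, 1:])
    G = np.vstack([Z, U])                 # stacked regressors
    AB = Zp @ np.linalg.pinv(G)
    k = Z.shape[0]
    return AB[:, :k], AB[:, k:]

# toy data from x+ = 0.9 x - 0.1 x^2 + 0.2 u
rng = np.random.default_rng(1)
x = np.zeros((1, 200)); u = rng.uniform(-1, 1, (1, 199))
for k in range(199):
    x[0, k + 1] = 0.9 * x[0, k] - 0.1 * x[0, k] ** 2 + 0.2 * u[0, k]
A, B = fit_koopman_with_control(x, u)
print(A.shape, B.shape)                   # (3, 3) (3, 1)
```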
arXiv:2307.04101 [pdf, other] cs.CV eess.IV
Enhancing Building Semantic Segmentation Accuracy with Super Resolution and Deep Learning: Investigating the Impact of Spatial Resolution on Various Datasets
Authors: Zhiling Guo, Xiaodan Shi, Haoran Zhang, Dou Huang, Xiaoya Song, Jinyue Yan, Ryosuke Shibasaki
Abstract: The development of remote sensing and deep learning techniques has enabled building semantic segmentation with high accuracy and efficiency. Despite their success in different tasks, the discussion of the impact of spatial resolution on deep learning based building semantic segmentation is quite inadequate, which makes choosing a more cost-effective data source a big challenge. To address this issue, in this study we create remote sensing images over three study areas at multiple spatial resolutions by super-resolution and down-sampling. After that, two representative deep learning architectures, UNet and FPN, are selected for model training and testing. The experimental results obtained from three cities with two deep learning models indicate that spatial resolution greatly influences building segmentation results, with the best cost-effectiveness at around 0.3 m, which we believe will be an important insight for data selection and preparation.
Submitted 9 July, 2023; originally announced July 2023.

arXiv:2307.03698 [pdf, other] eess.IV cs.CV cs.RO
Motion Magnification in Robotic Sonography: Enabling Pulsation-Aware Artery Segmentation
Authors: Dianye Huang, Yuan Bi, Nassir Navab, Zhongliang Jiang
Abstract: Ultrasound (US) imaging is widely used for diagnosing and monitoring arterial diseases, mainly due to the advantages of being non-invasive, radiation-free, and real-time. In order to provide additional information to assist clinicians in diagnosis, tubular structures are often segmented from US images. To improve artery segmentation accuracy and stability during scans, this work presents a novel pulsation-assisted segmentation neural network (PAS-NN) that explicitly takes advantage of cardiac-induced motions. Motion magnification techniques are employed to amplify the subtle motion within the frequency band of interest in order to extract pulsation signals from sequential US images. The extracted real-time pulsation information can help to locate the arteries on cross-sectional US images; therefore, we explicitly integrate the pulsation into the proposed PAS-NN as attention guidance. Notably, a robotic arm is necessary to provide stable movement during US imaging, since magnifying the target motions from US images captured along a scan path is not feasible by hand due to hand tremor. To validate the proposed robotic US system for imaging arteries, experiments were carried out on volunteers' carotid and radial arteries. The results demonstrate that PAS-NN achieves results comparable to the state of the art on the carotid artery and can effectively improve segmentation performance for small vessels (radial artery).
Submitted 7 July, 2023; originally announced July 2023.
Comments: Accepted Paper IROS 2023
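A core step behind the pulsation extraction described above is temporal band-pass filtering of each pixel's intensity around the cardiac frequency band. The SciPy sketch below shows that generic ingredient; the band limits and filter order are illustrative assumptions, not the PAS-NN pipeline.

```python
import numpy as np
from scipy.signal import butter, filtfilt

def pulsation_band(frames, fs, low_hz=0.8, high_hz=3.0, order=4):
    """Band-pass each pixel's time series around the cardiac band.

    frames: (T, H, W) ultrasound image sequence sampled at fs Hz. Returns an
    array of the same shape containing only the 0.8-3 Hz (roughly 48-180 bpm)
    intensity variations, which highlights pulsating structures.
    """
    b, a = butter(order, [low_hz, high_hz], btype="bandpass", fs=fs)
    return filtfilt(b, a, frames, axis=0)

frames = np.random.rand(120, 32, 32)      # 4 s of fake frames at 30 Hz
pulse = pulsation_band(frames, fs=30.0)
print(pulse.shape)                        # (120, 32, 32)
```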
In this work, we propose an RF fingerprinting based transmitter authentication approach density trace plot (DTP) to exploit device-identifi&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.16622v2-abstract-full').style.display = 'inline'; document.getElementById('2306.16622v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2306.16622v2-abstract-full" style="display: none;"> Devices authentication is one crucial aspect of any communication system. Recently, the physical layer approach radio frequency (RF) fingerprinting has gained increased interest as it provides an extra layer of security without requiring additional components. In this work, we propose an RF fingerprinting based transmitter authentication approach density trace plot (DTP) to exploit device-identifiable fingerprints. By considering IQ imbalance solely as the feature source, DTP can efficiently extract device-identifiable fingerprints from symbol transition trajectories and density center drifts. In total, three DTP modalities based on constellation, eye and phase traces are respectively generated and tested against three deep learning classifiers: the 2D-CNN, 2D-CNN+biLSTM and 3D-CNN. The feasibility of these DTP and classifier pairs is verified using a practical dataset collected from the ADALM-PLUTO software-defined radios (SDRs). <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.16622v2-abstract-full').style.display = 'none'; document.getElementById('2306.16622v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 28 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2023. 
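<p class="is-size-7">To make the density trace plot (DTP) idea above concrete, the following minimal Python sketch rasterizes consecutive IQ symbol transitions into a 2-D constellation-trace density image of the kind a CNN classifier could consume; the toy QPSK symbols, grid size, and interpolation density are illustrative assumptions rather than the authors' implementation.</p>
<pre><code>
# Minimal sketch of a constellation-style density trace plot (DTP): consecutive IQ
# symbols are connected by straight-line trajectories and rasterized into a 2-D
# density image that a CNN classifier could consume. The symbol array, grid size,
# and interpolation density are illustrative assumptions only.
import numpy as np

def constellation_dtp(symbols: np.ndarray, grid: int = 64, steps: int = 32) -> np.ndarray:
    """Rasterize symbol-to-symbol transitions into a normalized density image."""
    starts, ends = symbols[:-1], symbols[1:]
    t = np.linspace(0.0, 1.0, steps)[None, :]              # interpolation fractions
    path = starts[:, None] * (1 - t) + ends[:, None] * t   # complex trajectories
    i, q = path.real.ravel(), path.imag.ravel()
    lim = 1.1 * np.max(np.abs(symbols)) + 1e-12
    img, _, _ = np.histogram2d(i, q, bins=grid, range=[[-lim, lim], [-lim, lim]])
    return img / img.max()                                  # normalize to [0, 1]

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    # Toy QPSK symbols with a small IQ imbalance and noise standing in for a device fingerprint.
    bits = rng.integers(0, 4, size=2000)
    ideal = np.exp(1j * (np.pi / 4 + np.pi / 2 * bits))
    received = 1.02 * ideal.real + 1j * 0.97 * ideal.imag + 0.05 * (
        rng.standard_normal(2000) + 1j * rng.standard_normal(2000))
    print(constellation_dtp(received).shape)  # (64, 64) density image
</code></pre>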
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2305.18234">arXiv:2305.18234</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2305.18234">pdf</a>, <a href="https://arxiv.org/format/2305.18234">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Temporal Aware Mixed Attention-based Convolution and Transformer Network (MACTN) for EEG Emotion Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Si%2C+X">Xiaopeng Si</a>, <a href="/search/eess?searchtype=author&amp;query=Huang%2C+D">Dong Huang</a>, <a href="/search/eess?searchtype=author&amp;query=Sun%2C+Y">Yulin Sun</a>, <a href="/search/eess?searchtype=author&amp;query=Ming%2C+D">Dong Ming</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2305.18234v1-abstract-short" style="display: inline;"> Emotion recognition plays a crucial role in human-computer interaction, and electroencephalography (EEG) is advantageous for reflecting human emotional states. In this study, we propose MACTN, a hierarchical hybrid model for jointly modeling local and global temporal information. The model is inspired by neuroscience research on the temporal dynamics of emotions. MACTN extracts local emotional fea&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.18234v1-abstract-full').style.display = 'inline'; document.getElementById('2305.18234v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2305.18234v1-abstract-full" style="display: none;"> Emotion recognition plays a crucial role in human-computer interaction, and electroencephalography (EEG) is advantageous for reflecting human emotional states. In this study, we propose MACTN, a hierarchical hybrid model for jointly modeling local and global temporal information. The model is inspired by neuroscience research on the temporal dynamics of emotions. MACTN extracts local emotional features through a convolutional neural network (CNN) and integrates sparse global emotional features through a transformer. Moreover, we employ channel attention mechanisms to identify the most task-relevant channels. Through extensive experimentation on two publicly available datasets, namely THU-EP and DEAP, our proposed method, MACTN, consistently achieves superior classification accuracy and F1 scores compared to other existing methods in most experimental settings. Furthermore, ablation studies have shown that the integration of both self-attention mechanisms and channel attention mechanisms leads to improved classification performance. Finally, an earlier version of this method, which shares the same ideas, won the Emotional BCI Competition&#39;s final championship in the 2022 World Robot Contest. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.18234v1-abstract-full').style.display = 'none'; document.getElementById('2305.18234v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2305.14781">arXiv:2305.14781</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2305.14781">pdf</a>, <a href="https://arxiv.org/format/2305.14781">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Optimization and Control">math.OC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Accelerated Nonconvex ADMM with Self-Adaptive Penalty for Rank-Constrained Model Identification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Liu%2C+Q">Qingyuan Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Huang%2C+Z">Zhengchao Huang</a>, <a href="/search/eess?searchtype=author&amp;query=Ye%2C+H">Hao Ye</a>, <a href="/search/eess?searchtype=author&amp;query=Huang%2C+D">Dexian Huang</a>, <a href="/search/eess?searchtype=author&amp;query=Shang%2C+C">Chao Shang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2305.14781v2-abstract-short" style="display: inline;"> The alternating direction method of multipliers (ADMM) has been widely adopted in low-rank approximation and low-order model identification tasks; however, the performance of nonconvex ADMM is highly reliant on the choice of penalty parameter. To accelerate ADMM for solving rank-constrained identification problems, this paper proposes a new self-adaptive strategy for automatic penalty update. Guid&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.14781v2-abstract-full').style.display = 'inline'; document.getElementById('2305.14781v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2305.14781v2-abstract-full" style="display: none;"> The alternating direction method of multipliers (ADMM) has been widely adopted in low-rank approximation and low-order model identification tasks; however, the performance of nonconvex ADMM is highly reliant on the choice of penalty parameter. To accelerate ADMM for solving rank-constrained identification problems, this paper proposes a new self-adaptive strategy for automatic penalty update. Guided by first-order analysis of the increment of the augmented Lagrangian, the self-adaptive penalty updating enables effective and balanced minimization of both primal and dual residuals and thus ensures a stable convergence. Moreover, improved efficiency can be obtained within the Anderson acceleration scheme. Numerical examples show that the proposed strategy significantly accelerates the convergence of nonconvex ADMM while alleviating the critical reliance on tedious tuning of penalty parameters. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.14781v2-abstract-full').style.display = 'none'; document.getElementById('2305.14781v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 September, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 24 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">7 pages, 5 figures. Accepted by 62nd IEEE Conference on Decision and Control (CDC 2023)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2305.08408">arXiv:2305.08408</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2305.08408">pdf</a>, <a href="https://arxiv.org/format/2305.08408">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> SB-VQA: A Stack-Based Video Quality Assessment Framework for Video Enhancement </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Huang%2C+D">Ding-Jiun Huang</a>, <a href="/search/eess?searchtype=author&amp;query=Kao%2C+Y">Yu-Ting Kao</a>, <a href="/search/eess?searchtype=author&amp;query=Chuang%2C+T">Tieh-Hung Chuang</a>, <a href="/search/eess?searchtype=author&amp;query=Tsai%2C+Y">Ya-Chun Tsai</a>, <a href="/search/eess?searchtype=author&amp;query=Lou%2C+J">Jing-Kai Lou</a>, <a href="/search/eess?searchtype=author&amp;query=Guan%2C+S">Shuen-Huei Guan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2305.08408v1-abstract-short" style="display: inline;"> In recent years, several video quality assessment (VQA) methods have been developed, achieving high performance. However, these methods were not specifically trained for enhanced videos, which limits their ability to predict video quality accurately based on human subjective perception. To address this issue, we propose a stack-based framework for VQA that outperforms existing state-of-the-art met&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.08408v1-abstract-full').style.display = 'inline'; document.getElementById('2305.08408v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2305.08408v1-abstract-full" style="display: none;"> In recent years, several video quality assessment (VQA) methods have been developed, achieving high performance. However, these methods were not specifically trained for enhanced videos, which limits their ability to predict video quality accurately based on human subjective perception. To address this issue, we propose a stack-based framework for VQA that outperforms existing state-of-the-art methods on VDPVE, a dataset consisting of enhanced videos. 
In addition to proposing the VQA framework for enhanced videos, we also investigate its application on professionally generated content (PGC). To address copyright issues with premium content, we create the PGCVQ dataset, which consists of videos from YouTube. We evaluate our proposed approach and state-of-the-art methods on PGCVQ, and provide new insights on the results. Our experiments demonstrate that existing VQA algorithms can be applied to PGC videos, and we find that VQA performance for PGC videos can be improved by considering the plot of a play, which highlights the importance of video semantic understanding. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.08408v1-abstract-full').style.display = 'none'; document.getElementById('2305.08408v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">CVPR NTIRE 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2301.12344">arXiv:2301.12344</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2301.12344">pdf</a>, <a href="https://arxiv.org/format/2301.12344">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> TJ-FlyingFish: Design and Implementation of an Aerial-Aquatic Quadrotor with Tiltable Propulsion Units </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Liu%2C+X">Xuchen Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Dou%2C+M">Minghao Dou</a>, <a href="/search/eess?searchtype=author&amp;query=Huang%2C+D">Dongyue Huang</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+B">Biao Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Cui%2C+J">Jinqiang Cui</a>, <a href="/search/eess?searchtype=author&amp;query=Ren%2C+Q">Qinyuan Ren</a>, <a href="/search/eess?searchtype=author&amp;query=Dou%2C+L">Lihua Dou</a>, <a href="/search/eess?searchtype=author&amp;query=Gao%2C+Z">Zhi Gao</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+J">Jie Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+B+M">Ben M. Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2301.12344v2-abstract-short" style="display: inline;"> Aerial-aquatic vehicles are capable to move in the two most dominant fluids, making them more promising for a wide range of applications. We propose a prototype with special designs for propulsion and thruster configuration to cope with the vast differences in the fluid properties of water and air. 
For propulsion, the operating range is switched for the different mediums by the dual-speed propulsi&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2301.12344v2-abstract-full').style.display = 'inline'; document.getElementById('2301.12344v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2301.12344v2-abstract-full" style="display: none;"> Aerial-aquatic vehicles are capable to move in the two most dominant fluids, making them more promising for a wide range of applications. We propose a prototype with special designs for propulsion and thruster configuration to cope with the vast differences in the fluid properties of water and air. For propulsion, the operating range is switched for the different mediums by the dual-speed propulsion unit, providing sufficient thrust and also ensuring output efficiency. For thruster configuration, thrust vectoring is realized by the rotation of the propulsion unit around the mount arm, thus enhancing the underwater maneuverability. This paper presents a quadrotor prototype of this concept and the design details and realization in practice. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2301.12344v2-abstract-full').style.display = 'none'; document.getElementById('2301.12344v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 February, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 28 January, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">6 pages, 9 figures, accepted to 2023 IEEE International Conference on Robotics and Automation (ICRA)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2212.06299">arXiv:2212.06299</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2212.06299">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Interpretable Diabetic Retinopathy Diagnosis based on Biomarker Activation Map </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Zang%2C+P">Pengxiao Zang</a>, <a href="/search/eess?searchtype=author&amp;query=Hormel%2C+T+T">Tristan T. Hormel</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+J">Jie Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Guo%2C+Y">Yukun Guo</a>, <a href="/search/eess?searchtype=author&amp;query=Bailey%2C+S+T">Steven T. Bailey</a>, <a href="/search/eess?searchtype=author&amp;query=Flaxel%2C+C+J">Christina J. Flaxel</a>, <a href="/search/eess?searchtype=author&amp;query=Huang%2C+D">David Huang</a>, <a href="/search/eess?searchtype=author&amp;query=Hwang%2C+T+S">Thomas S. 
Hwang</a>, <a href="/search/eess?searchtype=author&amp;query=Jia%2C+Y">Yali Jia</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2212.06299v3-abstract-short" style="display: inline;"> Deep learning classifiers provide the most accurate means of automatically diagnosing diabetic retinopathy (DR) based on optical coherence tomography (OCT) and its angiography (OCTA). The power of these models is attributable in part to the inclusion of hidden layers that provide the complexity required to achieve a desired task. However, hidden layers also render algorithm outputs difficult to in&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2212.06299v3-abstract-full').style.display = 'inline'; document.getElementById('2212.06299v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2212.06299v3-abstract-full" style="display: none;"> Deep learning classifiers provide the most accurate means of automatically diagnosing diabetic retinopathy (DR) based on optical coherence tomography (OCT) and its angiography (OCTA). The power of these models is attributable in part to the inclusion of hidden layers that provide the complexity required to achieve a desired task. However, hidden layers also render algorithm outputs difficult to interpret. Here we introduce a novel biomarker activation map (BAM) framework based on generative adversarial learning that allows clinicians to verify and understand the classifier&#39;s decision-making. A data set including 456 macular scans was graded as non-referable or referable DR based on current clinical standards. A DR classifier that was used to evaluate our BAM was first trained based on this data set. The BAM generation framework was designed by combining two U-shaped generators to provide meaningful interpretability to this classifier. The main generator was trained to take referable scans as input and produce an output that would be classified by the classifier as non-referable. The BAM is then constructed as the difference image between the output and input of the main generator. To ensure that the BAM only highlights classifier-utilized biomarkers, an assistant generator was trained to do the opposite, producing scans that would be classified as referable by the classifier from non-referable scans. The generated BAMs highlighted known pathologic features including nonperfusion area and retinal fluid. A fully interpretable classifier based on these highlights could help clinicians better utilize and verify automated DR diagnosis. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2212.06299v3-abstract-full').style.display = 'none'; document.getElementById('2212.06299v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 12 December, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2022.
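<p class="is-size-7">As an illustration of the biomarker activation map (BAM) construction described in the abstract above, the sketch below forms the BAM as the normalized absolute difference between a generator's output and its input; the placeholder generator and toy scan stand in for the trained U-shaped generator and real OCTA data, and are assumptions for illustration only.</p>
<pre><code>
# Minimal sketch of the BAM construction: the BAM is the difference image between a
# (pre-trained) main generator's output, which translates a referable-DR scan toward a
# non-referable appearance, and its input. `main_generator` here is a stand-in; in the
# paper it is a trained U-shaped generator.
import numpy as np

def main_generator(scan: np.ndarray) -> np.ndarray:
    """Placeholder for the trained referable-to-non-referable generator."""
    # Illustrative stand-in: smooth the scan slightly so the difference is non-trivial.
    out = scan.copy()
    out[1:-1, 1:-1] = (scan[:-2, 1:-1] + scan[2:, 1:-1] + scan[1:-1, :-2] + scan[1:-1, 2:]) / 4.0
    return out

def biomarker_activation_map(scan: np.ndarray) -> np.ndarray:
    """BAM = |generator output - input|, highlighting classifier-relevant regions."""
    translated = main_generator(scan)
    bam = np.abs(translated - scan)
    return bam / (bam.max() + 1e-12)

if __name__ == "__main__":
    rng = np.random.default_rng(1)
    octa_scan = rng.random((304, 304)).astype(np.float32)  # toy 304x304 en-face scan
    print(biomarker_activation_map(octa_scan).shape)
</code></pre>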
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This paper has been accepted by IEEE TBME</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.0; I.4.0; J.3 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2211.12421">arXiv:2211.12421</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2211.12421">pdf</a>, <a href="https://arxiv.org/format/2211.12421">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Neurons and Cognition">q-bio.NC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Data-Driven Network Neuroscience: On Data Collection and Benchmark </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Xu%2C+J">Jiaxing Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Yang%2C+Y">Yunhan Yang</a>, <a href="/search/eess?searchtype=author&amp;query=Huang%2C+D+T+J">David Tse Jung Huang</a>, <a href="/search/eess?searchtype=author&amp;query=Gururajapathy%2C+S+S">Sophi Shilpa Gururajapathy</a>, <a href="/search/eess?searchtype=author&amp;query=Ke%2C+Y">Yiping Ke</a>, <a href="/search/eess?searchtype=author&amp;query=Qiao%2C+M">Miao Qiao</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+A">Alan Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Kumar%2C+H">Haribalan Kumar</a>, <a href="/search/eess?searchtype=author&amp;query=McGeown%2C+J">Josh McGeown</a>, <a href="/search/eess?searchtype=author&amp;query=Kwon%2C+E">Eryn Kwon</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2211.12421v6-abstract-short" style="display: inline;"> This paper presents a comprehensive and quality collection of functional human brain network data for potential research in the intersection of neuroscience, machine learning, and graph analytics. Anatomical and functional MRI images have been used to understand the functional connectivity of the human brain and are particularly important in identifying underlying neurodegenerative conditions such&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.12421v6-abstract-full').style.display = 'inline'; document.getElementById('2211.12421v6-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2211.12421v6-abstract-full" style="display: none;"> This paper presents a comprehensive and quality collection of functional human brain network data for potential research in the intersection of neuroscience, machine learning, and graph analytics. Anatomical and functional MRI images have been used to understand the functional connectivity of the human brain and are particularly important in identifying underlying neurodegenerative conditions such as Alzheimer&#39;s, Parkinson&#39;s, and Autism. 
Recently, the study of the brain in the form of brain networks using machine learning and graph analytics has become increasingly popular, especially to predict the early onset of these conditions. A brain network, represented as a graph, retains rich structural and positional information that traditional examination methods are unable to capture. However, the lack of publicly accessible brain network data prevents researchers from data-driven explorations. One of the main difficulties lies in the complicated domain-specific preprocessing steps and the exhaustive computation required to convert the data from MRI images into brain networks. We bridge this gap by collecting a large amount of MRI images from public databases and a private source, working with domain experts to make sensible design choices, and preprocessing the MRI images to produce a collection of brain network datasets. The datasets originate from 6 different sources, cover 4 brain conditions, and consist of a total of 2,702 subjects. We test our graph datasets on 12 machine learning models to provide baselines and validate the data quality on a recent graph analysis model. To lower the barrier to entry and promote the research in this interdisciplinary field, we release our brain network data and complete preprocessing details including codes at https://doi.org/10.17608/k6.auckland.21397377 and https://github.com/brainnetuoa/data_driven_network_neuroscience. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.12421v6-abstract-full').style.display = 'none'; document.getElementById('2211.12421v6-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 10 November, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2022. 
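<p class="is-size-7">For readers new to the brain-network representation used by the benchmark above, the following sketch shows one common way to turn a functional connectivity matrix into a weighted graph; the toy time series, the thresholding rule, and the use of networkx are illustrative assumptions, not the released preprocessing pipeline.</p>
<pre><code>
# Minimal sketch of turning a functional connectivity matrix into a brain-network graph,
# the kind of representation released by the benchmark above. File-free toy data, the
# top-10% threshold, and networkx are illustrative assumptions only.
import numpy as np
import networkx as nx

def connectivity_to_graph(conn: np.ndarray, keep_top: float = 0.10) -> nx.Graph:
    """Keep the strongest `keep_top` fraction of correlations as weighted edges."""
    n = conn.shape[0]
    iu = np.triu_indices(n, k=1)
    weights = conn[iu]
    cutoff = np.quantile(np.abs(weights), 1.0 - keep_top)
    graph = nx.Graph()
    graph.add_nodes_from(range(n))
    for i, j, w in zip(iu[0], iu[1], weights):
        if abs(w) >= cutoff:
            graph.add_edge(int(i), int(j), weight=float(w))
    return graph

if __name__ == "__main__":
    rng = np.random.default_rng(2)
    ts = rng.standard_normal((200, 90))      # toy time series: 200 volumes x 90 regions
    conn = np.corrcoef(ts, rowvar=False)     # 90x90 functional connectivity matrix
    g = connectivity_to_graph(conn)
    print(g.number_of_nodes(), g.number_of_edges())
</code></pre>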
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Advances in Neural Information Processing Systems, 2023 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2210.04990">arXiv:2210.04990</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2210.04990">pdf</a>, <a href="https://arxiv.org/ps/2210.04990">ps</a>, <a href="https://arxiv.org/format/2210.04990">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Racial Disparities in Pulse Oximetry Cannot Be Fixed With Race-Based Correction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Patwari%2C+N">Neal Patwari</a>, <a href="/search/eess?searchtype=author&amp;query=Huang%2C+D">Di Huang</a>, <a href="/search/eess?searchtype=author&amp;query=Bonetta-Misteli%2C+K">Kiki Bonetta-Misteli</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2210.04990v1-abstract-short" style="display: inline;"> Studies have shown pulse oximeter measurements of blood oxygenation have statistical bias that is a function of race, which results in higher rates of occult hypoxemia, i.e., missed detection of dangerously low oxygenation, in patients of color. This paper further characterizes the statistical distribution of pulse ox measurements, showing they also have a higher variance for patients racialized a&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2210.04990v1-abstract-full').style.display = 'inline'; document.getElementById('2210.04990v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2210.04990v1-abstract-full" style="display: none;"> Studies have shown pulse oximeter measurements of blood oxygenation have statistical bias that is a function of race, which results in higher rates of occult hypoxemia, i.e., missed detection of dangerously low oxygenation, in patients of color. This paper further characterizes the statistical distribution of pulse ox measurements, showing they also have a higher variance for patients racialized as Black, compared to those racialized as white. We show that no single race-based correction factor will provide equal performance in the detection of hypoxemia. The results have implications for racially equitable pulse oximetry. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2210.04990v1-abstract-full').style.display = 'none'; document.getElementById('2210.04990v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 October, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2022. 
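<p class="is-size-7">The statistical argument above can be illustrated with a small simulation: if measurement error has a group-dependent bias and a group-dependent variance, a single constant correction can match the group means yet still leave unequal rates of occult hypoxemia. All numbers and thresholds in the sketch below are assumptions chosen purely for illustration and are not taken from the paper.</p>
<pre><code>
# Illustrative simulation: a mean-matching constant correction cannot equalize missed
# detections of hypoxemia when the error variance differs between groups. Biases,
# standard deviations, and thresholds are assumed values for illustration only.
import numpy as np

rng = np.random.default_rng(3)
sao2 = rng.uniform(80, 100, size=200_000)            # true arterial saturation (%)

def spo2(sao2, bias, sigma):
    return sao2 + bias + sigma * rng.standard_normal(sao2.shape)

groups = {"group A": spo2(sao2, bias=1.0, sigma=2.0),
          "group B": spo2(sao2, bias=3.0, sigma=4.0)}    # larger bias and variance

for name, reading in groups.items():
    corrected = reading - (reading.mean() - sao2.mean())  # single mean-matching correction
    occult = np.mean((sao2 < 88) & (corrected >= 92))     # missed-hypoxemia rate
    print(f"{name}: occult-hypoxemia rate after correction = {occult:.3%}")
</code></pre>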
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">6 pages, originally submitted to IEEE SPMB 2022 on 1 July 2022</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2207.06424">arXiv:2207.06424</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2207.06424">pdf</a>, <a href="https://arxiv.org/format/2207.06424">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computational Engineering, Finance, and Science">cs.CE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Optimal control of dielectric elastomer actuated multibody dynamical systems </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Huang%2C+D">Dengpeng Huang</a>, <a href="/search/eess?searchtype=author&amp;query=Leyendecker%2C+S">Sigrid Leyendecker</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2207.06424v1-abstract-short" style="display: inline;"> In this work, a simulation model for the optimal control of dielectric elastomer actuated flexible multibody dynamics systems is presented. The Dielectric Elastomer Actuator (DEA) behaves like a flexible artificial muscles in soft robotics. It is modeled as an electromechanically coupled geometrically exact beam, where the electric charges serve as control variables. The DEA-beam is integrated as&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2207.06424v1-abstract-full').style.display = 'inline'; document.getElementById('2207.06424v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2207.06424v1-abstract-full" style="display: none;"> In this work, a simulation model for the optimal control of dielectric elastomer actuated flexible multibody dynamics systems is presented. The Dielectric Elastomer Actuator (DEA) behaves like a flexible artificial muscles in soft robotics. It is modeled as an electromechanically coupled geometrically exact beam, where the electric charges serve as control variables. The DEA-beam is integrated as an actuator into multibody systems consisting of rigid and flexible components. The model also represents contact interaction via unilateral constraints between the beam actuator and e.g. a rigid body during the grasping process of a soft robot. Specifically for the DEA, a work conjugated electric displacement and strain-like electric variables are derived for the Cosserat beam. With a mathematically concise and physically representative formulation, a reduced free energy function is developed for the beam-DEA. In the optimal control problem, an objective function is minimized while the dynamic balance equations for the multibody system have to be fulfilled together with the complementarity conditions for the contact and boundary conditions. The optimal control problem is solved via a direct transcription method, transforming it into a constrained nonlinear optimization problem. 
The beam is firstly semidiscretized with 1D finite elements and then the multibody dynamics is temporally discretized with a variational integrator leading to the discrete Euler-Lagrange equations, which are further reduced with the null space projection. The discrete Euler-Lagrange equations and the boundary conditions serve as equality constraints, whereas the contact constraints are treated as inequality constraints in the optimization of the discretized objective. The effectiveness of the developed model is demonstrated by three numerical examples, including a cantilever beam, a soft robotic worm and a soft grasper. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2207.06424v1-abstract-full').style.display = 'none'; document.getElementById('2207.06424v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 July, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">22 pages, 11 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2206.02596">arXiv:2206.02596</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2206.02596">pdf</a>, <a href="https://arxiv.org/format/2206.02596">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> A Robust Deep Learning Enabled Semantic Communication System for Text </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Peng%2C+X">Xiang Peng</a>, <a href="/search/eess?searchtype=author&amp;query=Qin%2C+Z">Zhijin Qin</a>, <a href="/search/eess?searchtype=author&amp;query=Huang%2C+D">Danlan Huang</a>, <a href="/search/eess?searchtype=author&amp;query=Tao%2C+X">Xiaoming Tao</a>, <a href="/search/eess?searchtype=author&amp;query=Lu%2C+J">Jianhua Lu</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+G">Guangyi Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Pan%2C+C">Chengkang Pan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2206.02596v1-abstract-short" style="display: inline;"> With the advent of the 6G era, the concept of semantic communication has attracted increasing attention. Compared with conventional communication systems, semantic communication systems are not only affected by physical noise existing in the wireless communication environment, e.g., additional white Gaussian noise, but also by semantic noise due to the source and the nature of deep learning-based&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2206.02596v1-abstract-full').style.display = 'inline'; document.getElementById('2206.02596v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2206.02596v1-abstract-full" style="display: none;"> With the advent of the 6G era, the concept of semantic communication has attracted increasing attention. 
Compared with conventional communication systems, semantic communication systems are not only affected by physical noise existing in the wireless communication environment, e.g., additional white Gaussian noise, but also by semantic noise due to the source and the nature of deep learning-based systems. In this paper, we elaborate on the mechanism of semantic noise. In particular, we categorize semantic noise into two categories: literal semantic noise and adversarial semantic noise. The former is caused by written errors or expression ambiguity, while the latter is caused by perturbations or attacks added to the embedding layer via the semantic channel. To prevent semantic noise from influencing semantic communication systems, we present a robust deep learning enabled semantic communication system (R-DeepSC) that leverages a calibrated self-attention mechanism and adversarial training to tackle semantic noise. Compared with baseline models that only consider physical noise for text transmission, the proposed R-DeepSC achieves remarkable performance in dealing with semantic noise under different signal-to-noise ratios. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2206.02596v1-abstract-full').style.display = 'none'; document.getElementById('2206.02596v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 June, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">6 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2205.03214">arXiv:2205.03214</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2205.03214">pdf</a>, <a href="https://arxiv.org/format/2205.03214">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Modularized Bilinear Koopman Operator for Modeling and Predicting Transients of Microgrids </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Jiang%2C+X">Xinyuan Jiang</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+Y">Yan Li</a>, <a href="/search/eess?searchtype=author&amp;query=Huang%2C+D">Daning Huang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2205.03214v2-abstract-short" style="display: inline;"> Modularized Koopman Bilinear Form (M-KBF) is presented to model and predict the transient dynamics of microgrids in the presence of disturbances. 
As a scalable data-driven approach, M-KBF divides the identification and prediction of the high-dimensional nonlinear system into the individual study of subsystems; and thus, alleviating the difficulty of intensively handling high volume data and overco&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2205.03214v2-abstract-full').style.display = 'inline'; document.getElementById('2205.03214v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2205.03214v2-abstract-full" style="display: none;"> Modularized Koopman Bilinear Form (M-KBF) is presented to model and predict the transient dynamics of microgrids in the presence of disturbances. As a scalable data-driven approach, M-KBF divides the identification and prediction of the high-dimensional nonlinear system into the individual study of subsystems; and thus, alleviating the difficulty of intensively handling high volume data and overcoming the curse of dimensionality. For each subsystem, Koopman bilinear form is applied to efficiently identify its model by developing eigenfunctions via the extended dynamic mode decomposition method with an eigenvalue-based order truncation. Extensive tests show that M-KBF can provide accurate transient dynamics prediction for the nonlinear microgrids and verify the plug-and-play modeling and prediction function, which offers a potent tool for identifying high-dimensional systems. The modularity feature of M-KBF enables the provision of fast and precise prediction for the microgrid operation and control, paving the way towards online applications. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2205.03214v2-abstract-full').style.display = 'none'; document.getElementById('2205.03214v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 May, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 6 May, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2022. 
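<p class="is-size-7">As background for the abstract above, the sketch below shows plain extended dynamic mode decomposition (EDMD), the building block used to obtain Koopman models from snapshot data: lift the states with a dictionary of observables and fit a linear operator by least squares. The dictionary, the toy system, and the unforced single-subsystem setting are simplifying assumptions; the paper's bilinear form, modularization, and eigenvalue-based truncation are not reproduced here.</p>
<pre><code>
# Minimal sketch of extended dynamic mode decomposition (EDMD): lift state snapshots
# with a dictionary of observables and fit a linear Koopman-operator approximation by
# least squares. Unforced, single-subsystem case only; dictionary and toy data are
# illustrative assumptions.
import numpy as np

def dictionary(x: np.ndarray) -> np.ndarray:
    """Simple polynomial dictionary: [1, x1, x2, x1^2, x1*x2, x2^2] per snapshot."""
    x1, x2 = x[:, 0], x[:, 1]
    return np.column_stack([np.ones_like(x1), x1, x2, x1**2, x1 * x2, x2**2])

def edmd(snapshots: np.ndarray) -> np.ndarray:
    """Fit K such that psi(x_{k+1}) ~= psi(x_k) @ K in a least-squares sense."""
    psi_x = dictionary(snapshots[:-1])
    psi_y = dictionary(snapshots[1:])
    K, *_ = np.linalg.lstsq(psi_x, psi_y, rcond=None)
    return K

if __name__ == "__main__":
    # Toy trajectory of a weakly nonlinear 2-state system.
    x = np.zeros((500, 2)); x[0] = [1.0, 0.5]
    for k in range(499):
        x[k + 1, 0] = 0.95 * x[k, 0]
        x[k + 1, 1] = 0.80 * x[k, 1] + 0.05 * x[k, 0] ** 2
    K = edmd(x)
    print("Leading Koopman eigenvalue magnitudes:", np.sort(np.abs(np.linalg.eigvals(K)))[::-1][:3])
</code></pre>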
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2204.08557">arXiv:2204.08557</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2204.08557">pdf</a>, <a href="https://arxiv.org/format/2204.08557">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> PIDGeuN: Graph Neural Network-Enabled Transient Dynamics Prediction of Networked Microgrids Through Full-Field Measurement </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Yu%2C+Y">Yin Yu</a>, <a href="/search/eess?searchtype=author&amp;query=Jiang%2C+X">Xinyuan Jiang</a>, <a href="/search/eess?searchtype=author&amp;query=Huang%2C+D">Daning Huang</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+Y">Yan Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2204.08557v1-abstract-short" style="display: inline;"> A Physics-Informed Dynamic Graph Neural Network (PIDGeuN) is presented to accurately, efficiently and robustly predict the nonlinear transient dynamics of microgrids in the presence of disturbances. The graph-based architecture of PIDGeuN provides a natural representation of the microgrid topology. Using only the state information that is practically measurable, PIDGeuN employs a time delay embedd&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2204.08557v1-abstract-full').style.display = 'inline'; document.getElementById('2204.08557v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2204.08557v1-abstract-full" style="display: none;"> A Physics-Informed Dynamic Graph Neural Network (PIDGeuN) is presented to accurately, efficiently and robustly predict the nonlinear transient dynamics of microgrids in the presence of disturbances. The graph-based architecture of PIDGeuN provides a natural representation of the microgrid topology. Using only the state information that is practically measurable, PIDGeuN employs a time delay embedding formulation to fully reproduce the system dynamics, avoiding the dependency of conventional methods on internal dynamic states such as controllers. Based on a judiciously designed message passing mechanism, the PIDGeuN incorporates two physics-informed techniques to improve its prediction performance, including a physics-data-infusion approach to determining the inter-dependencies between buses, and a loss term to respect the known physical law of the power system, i.e., the Kirchhoff&#39;s law, to ensure the feasibility of the model prediction. Extensive tests show that PIDGeuN can provide accurate and robust prediction of transient dynamics for nonlinear microgrids over a long-term time period. Therefore, the PIDGeuN offers a potent tool for the modeling of large scale networked microgrids (NMs), with potential applications to predictive or preventive control in real time applications for the stable and resilient operations of NMs. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2204.08557v1-abstract-full').style.display = 'none'; document.getElementById('2204.08557v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 April, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This paper is currently under review for a journal</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2112.06657">arXiv:2112.06657</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2112.06657">pdf</a>, <a href="https://arxiv.org/format/2112.06657">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> You Can Wash Hands Better: Accurate Daily Handwashing Assessment with Smartwatches </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Wang%2C+F">Fei Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Wu%2C+X">Xilei Wu</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+T">Tingting Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+X">Xin Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+P">Pengcheng Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Ding%2C+H">Han Ding</a>, <a href="/search/eess?searchtype=author&amp;query=Shi%2C+J">Jingang Shi</a>, <a href="/search/eess?searchtype=author&amp;query=Han%2C+J">Jinsong Han</a>, <a href="/search/eess?searchtype=author&amp;query=Huang%2C+D">Dong Huang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2112.06657v3-abstract-short" style="display: inline;"> Hand hygiene is among the most effective daily practices for preventing infectious diseases such as influenza, malaria, and skin infections. While professional guidelines emphasize proper handwashing to reduce the risk of viral infections, surveys reveal that adherence to these recommendations remains low. To address this gap, we propose UWash, a wearable solution leveraging smartwatches to evalua&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2112.06657v3-abstract-full').style.display = 'inline'; document.getElementById('2112.06657v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2112.06657v3-abstract-full" style="display: none;"> Hand hygiene is among the most effective daily practices for preventing infectious diseases such as influenza, malaria, and skin infections. While professional guidelines emphasize proper handwashing to reduce the risk of viral infections, surveys reveal that adherence to these recommendations remains low. 
To address this gap, we propose UWash, a wearable solution leveraging smartwatches to evaluate handwashing procedures, aiming to raise awareness and cultivate high-quality handwashing habits. We frame the task of handwashing assessment as an action segmentation problem, similar to those in computer vision, and introduce a simple yet efficient two-stream UNet-like network to achieve this goal. Experiments involving 51 subjects demonstrate that UWash achieves 92.27% accuracy in handwashing gesture recognition, an error of &lt;0.5 seconds in onset/offset detection, and an error of &lt;5 points in gesture scoring under user-dependent settings. The system also performs robustly in user-independent and user-independent-location-independent evaluations. Remarkably, UWash maintains high performance in real-world tests, including evaluations with 10 random passersby at a hospital 9 months later and 10 passersby in an in-the-wild test conducted 2 years later. UWash is the first system to score handwashing quality based on gesture sequences, offering actionable guidance for improving daily hand hygiene. The code and dataset are publicly available at \url{https://github.com/aiotgroup/UWash}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2112.06657v3-abstract-full').style.display = 'none'; document.getElementById('2112.06657v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 9 December, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Under review. 13 pages, 12 figures, 7 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2110.06465">arXiv:2110.06465</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2110.06465">pdf</a>, <a href="https://arxiv.org/format/2110.06465">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Breaking the Dilemma of Medical Image-to-image Translation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Kong%2C+L">Lingke Kong</a>, <a href="/search/eess?searchtype=author&amp;query=Lian%2C+C">Chenyu Lian</a>, <a href="/search/eess?searchtype=author&amp;query=Huang%2C+D">Detian Huang</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+Z">Zhenjiang Li</a>, <a href="/search/eess?searchtype=author&amp;query=Hu%2C+Y">Yanle Hu</a>, <a href="/search/eess?searchtype=author&amp;query=Zhou%2C+Q">Qichao Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2110.06465v2-abstract-short" style="display: inline;"> Supervised Pix2Pix and unsupervised Cycle-consistency are two modes that dominate the field of medical image-to-image translation. 
However, neither mode is ideal. The Pix2Pix mode has excellent performance, but it requires paired and well pixel-wise aligned images, which may not always be achievable due to respiratory motion or anatomy change between times that paired images are acquired. The Cy&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2110.06465v2-abstract-full').style.display = 'inline'; document.getElementById('2110.06465v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2110.06465v2-abstract-full" style="display: none;"> Supervised Pix2Pix and unsupervised Cycle-consistency are two modes that dominate the field of medical image-to-image translation. However, neither mode is ideal. The Pix2Pix mode has excellent performance, but it requires paired and well pixel-wise aligned images, which may not always be achievable due to respiratory motion or anatomy change between times that paired images are acquired. The Cycle-consistency mode is less stringent with training data and works well on unpaired or misaligned images, but its performance may not be optimal. In order to break the dilemma of the existing modes, we propose a new unsupervised mode called RegGAN for medical image-to-image translation. It is based on the theory of &#34;loss-correction&#34;. In RegGAN, the misaligned target images are considered as noisy labels and the generator is trained with an additional registration network to fit the misaligned noise distribution adaptively. The goal is to search for the common optimal solution to both image-to-image translation and registration tasks. We incorporated RegGAN into a few state-of-the-art image-to-image translation methods and demonstrated that RegGAN could be easily combined with these methods to improve their performance. For example, a simple CycleGAN in our mode surpasses the latest NICEGAN even though it uses fewer network parameters. Based on our results, RegGAN outperformed both Pix2Pix on aligned data and Cycle-consistency on misaligned or unpaired data. RegGAN is insensitive to noise, which makes it a better choice for a wide range of scenarios, especially for medical image-to-image translation tasks in which well pixel-wise aligned data are not available. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2110.06465v2-abstract-full').style.display = 'none'; document.getElementById('2110.06465v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 November, 2021; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 12 October, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2021.
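<p class="is-size-7">To illustrate the &#34;loss-correction&#34; idea summarized above, the sketch below trains a toy generator together with a registration network so that the reconstruction loss is computed after warping the generator output toward the possibly misaligned target; the tiny networks, the displacement regularizer, and the omission of the adversarial term are simplifying assumptions, not the authors' training code.</p>
<pre><code>
# Minimal sketch of a correction-style training step: a registration network R predicts a
# displacement field aligning the generator output G(x) to a misaligned target y, and the
# L1 loss is taken after warping. Network sizes and loss weights are assumptions only.
import torch
import torch.nn as nn
import torch.nn.functional as F

class TinyNet(nn.Module):
    """Toy stand-in for the generator / registration backbones."""
    def __init__(self, in_ch, out_ch):
        super().__init__()
        self.net = nn.Sequential(nn.Conv2d(in_ch, 16, 3, padding=1), nn.ReLU(),
                                 nn.Conv2d(16, out_ch, 3, padding=1))
    def forward(self, x):
        return self.net(x)

def warp(image, flow):
    """Warp `image` by a dense displacement field `flow` (B, 2, H, W), in pixels."""
    b, _, h, w = image.shape
    ys, xs = torch.meshgrid(torch.arange(h), torch.arange(w), indexing="ij")
    base = torch.stack((xs, ys), dim=0).float().to(image.device)          # (2, H, W)
    new = base.unsqueeze(0) + flow                                        # displaced coords
    # Normalize to [-1, 1] for grid_sample, which expects (B, H, W, 2) ordered as (x, y).
    grid = torch.stack((2 * new[:, 0] / (w - 1) - 1, 2 * new[:, 1] / (h - 1) - 1), dim=-1)
    return F.grid_sample(image, grid, align_corners=True)

G = TinyNet(1, 1)              # image-to-image generator
R = TinyNet(2, 2)              # registration net: (G(x), y) -> displacement field
opt = torch.optim.Adam(list(G.parameters()) + list(R.parameters()), lr=1e-4)

x = torch.rand(2, 1, 64, 64)   # toy source images
y = torch.rand(2, 1, 64, 64)   # toy, misaligned targets (noisy labels)

fake = G(x)
flow = R(torch.cat([fake, y], dim=1))
loss = F.l1_loss(warp(fake, flow), y) + 1e-2 * flow.pow(2).mean()  # correction + displacement regularizer
opt.zero_grad(); loss.backward(); opt.step()
print(float(loss))
</code></pre>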
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2110.04444">arXiv:2110.04444</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2110.04444">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Sensoring and Application of Multimodal Data for the Detection of Freezing of Gait in Parkinson&#39;s Disease </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+W">Wei Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Huang%2C+D">Debin Huang</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+H">Hantao Li</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+L">Lipeng Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Wei%2C+Y">Yanzhao Wei</a>, <a href="/search/eess?searchtype=author&amp;query=Pan%2C+K">Kang Pan</a>, <a href="/search/eess?searchtype=author&amp;query=Ma%2C+L">Lin Ma</a>, <a href="/search/eess?searchtype=author&amp;query=Feng%2C+H">Huanhuan Feng</a>, <a href="/search/eess?searchtype=author&amp;query=Pan%2C+J">Jing Pan</a>, <a href="/search/eess?searchtype=author&amp;query=Guo%2C+Y">Yuzhu Guo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2110.04444v1-abstract-short" style="display: inline;"> The accurate and reliable detection or prediction of freezing of gait (FOG) is important for fall prevention in Parkinson&#39;s Disease (PD) and studying the physiological transitions during the occurrence of FOG. Integrating both commercial and self-designed sensors, a protocol has been designed to acquire multimodal physical and physiological information during FOG, including gait acceleration (ACC&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2110.04444v1-abstract-full').style.display = 'inline'; document.getElementById('2110.04444v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2110.04444v1-abstract-full" style="display: none;"> The accurate and reliable detection or prediction of freezing of gait (FOG) is important for fall prevention in Parkinson&#39;s Disease (PD) and studying the physiological transitions during the occurrence of FOG. Integrating both commercial and self-designed sensors, a protocol has been designed to acquire multimodal physical and physiological information during FOG, including gait acceleration (ACC), electroencephalogram (EEG), electromyogram (EMG), and skin conductance (SC). Two tasks were designed to trigger FOG, including gait initiation failure and FOG during walking. A total of 12 PD patients completed the experiments and produced a total length of 3 hours and 42 minutes of valid data. The FOG episodes were labeled by two qualified physicians. Each unimodal data source and their combinations have been used to detect FOG. Results showed that multimodal data benefit the detection of FOG. Among unimodal data, EEG had better discriminative ability than ACC and EMG. However, the acquisition of EEG is more complicated.
arXiv:2109.08007 [pdf, other] cs.MM; cs.SD; eess.AS

Graph Fourier Transform based Audio Zero-watermarking

Authors: Longting Xu, Daiyu Huang, Syed Faham Ali Zaidi, Abdul Rauf, Rohan Kumar Das

Abstract: The frequent exchange of multimedia information in the present era projects an increasing demand for copyright protection. In this work, we propose a novel audio zero-watermarking technology based on the graph Fourier transform for enhancing robustness with respect to copyright protection. In this approach, a combined shift operator is used to construct the graph signal, upon which graph Fourier analysis is performed. The selected maximum-absolute graph Fourier coefficients, which represent the characteristics of the audio segment, are then encoded into a feature binary sequence using the K-means algorithm. Finally, the resulting feature binary sequence is XOR-ed with the watermark binary sequence to realize the embedding of the zero-watermark. Experimental studies show that the proposed approach resists common and synchronization attacks more effectively than existing state-of-the-art methods.

Submitted 16 September, 2021; originally announced September 2021.
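A rough sketch of the zero-watermarking recipe summarized above: build a graph signal from an audio segment, take its graph Fourier transform, binarize the largest-magnitude coefficients with K-means, and XOR the result with the watermark bits. The graph construction used here (a simple cycle-shift adjacency) and all parameters are assumptions for illustration only, not the paper's method.

```python
# Illustrative sketch (not the authors' method) of GFT-based zero-watermarking:
# GFT of an audio segment -> largest-|coefficient| features -> K-means binarization
# -> XOR with the watermark bits to form the ownership share.
import numpy as np
from sklearn.cluster import KMeans

def gft_zero_watermark(segment, watermark_bits):
    n = len(segment)
    # Assumed graph: directed cycle (shift operator); GFT basis = its eigenvectors.
    A = np.roll(np.eye(n), 1, axis=1)
    _, V = np.linalg.eig(A)
    coeffs = np.abs(np.linalg.inv(V) @ segment)       # graph Fourier coefficients
    k = len(watermark_bits)
    feats = np.sort(coeffs)[-k:]                      # k largest magnitudes
    labels = KMeans(n_clusters=2, n_init=10, random_state=0).fit_predict(
        feats.reshape(-1, 1))                         # binarize the features
    return labels.astype(np.uint8) ^ np.asarray(watermark_bits, np.uint8)

segment = np.random.randn(64)                         # hypothetical audio segment
wm = np.random.randint(0, 2, 16)                      # watermark bits
share = gft_zero_watermark(segment, wm)               # stored with a third party
print(share)
```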
arXiv:2104.14975 [pdf] eess.SY

Intelligent Decision Method for Main Control Parameters of Tunnel Boring Machine based on Multi-Objective Optimization of Excavation Efficiency and Cost

Authors: Bin Liu, Yaxu Wang, Guangzu Zhao, Bin Yang, Ruirui Wang, Dexiang Huang, Bin Xiang

Abstract: Timely and reasonable matching of the control parameters and the geological conditions of the rock mass during tunnel excavation is crucial for hard rock tunnel boring machines (TBMs).
Therefore, this paper proposes an intelligent decision method for the main TBM control parameters based on multi-objective optimization of excavation efficiency and cost. The main steps of the method are to identify the most important rock mass and machine parameters, determine the optimization objective, and establish the objective function. In this study, muck information was included as an important parameter in the traditional rock mass and machine parameter database, and the rock-machine interaction model was established using an improved neural network algorithm. Using 250 sets of data collected in the field, the validity of the rock-machine interaction model was verified. Taking cost as the optimization objective, a cost calculation model for tunneling and the cutter was obtained and, combined with the rock-machine interaction model, the objective function for cost-based control parameter optimization was established. Finally, a tunneling test was carried out at the engineering site, and the main TBM control parameters (thrust and torque) given by the optimization decision were used to excavate the test section. Compared with the section in which the TBM operators relied on experience, the average penetration rate of the TBM increased by 11.10% and the average cutter life increased by 15.62%. The results indicate that the method is effective for TBM tunneling in the test section.

Submitted 28 April, 2021; originally announced April 2021.

Comments: 29 pages, 10 figures
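To make the optimization step above concrete, a toy sketch follows: a surrogate rock-machine model predicts penetration rate from thrust and torque, and the control parameters are chosen by minimizing a cost objective under bounds. The surrogate function, cost coefficients, and bounds are invented placeholders; the paper's neural-network model and cost formulation are not reproduced here.

```python
# Toy sketch (placeholders only) of cost-based selection of TBM thrust and torque:
# a surrogate model maps (thrust, torque) -> penetration rate, and we minimize an
# assumed cost per meter that trades energy/wear against advance speed.
import numpy as np
from scipy.optimize import minimize

def penetration_rate(thrust_kN, torque_kNm):
    """Stand-in for the learned rock-machine interaction model (mm/rev)."""
    return 8.0 * (1 - np.exp(-thrust_kN / 8000.0)) * np.tanh(torque_kNm / 3000.0)

def cost_per_meter(x):
    thrust, torque = x
    rate = penetration_rate(thrust, torque) + 1e-6
    energy_cost = 0.002 * thrust + 0.004 * torque   # assumed energy/cutter-wear term
    return energy_cost / rate                       # cheaper when advancing faster

res = minimize(cost_per_meter, x0=[6000.0, 2000.0],
               bounds=[(3000.0, 12000.0), (1000.0, 4000.0)])
print("chosen thrust (kN) and torque (kNm):", res.x)
```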
arXiv:2103.11906 [pdf, other] eess.SP; eess.SY; doi: 10.1109/TMTT.2021.3103209

DC-Assisted Stabilization of Internal Oscillations for Improved Symbol Transitions in a Direct Antenna Modulation Transmitter

Authors: Danyang Huang, Kurt Schab, Joseph Dusenbury, Brandon Sluss, Jacob Adams

Abstract: Internal oscillations in switched antenna transmitters cause undesirable fluctuations of the stored energy in the system, reducing the effectiveness of time-varying broadbanding methods such as energy-synchronous direct antenna modulation. To mitigate these parasitic oscillations, a modified direct antenna modulation system with an auxiliary DC source is introduced to stabilize the energy stored on the antenna. A detailed circuit model of the direct antenna modulation system is used to identify the origin of the oscillations and to justify the selection of the DC source. Measured phase-shift-keyed waveforms transmitted by the modified system show significant increases in signal fidelity, including a 10-20 dB reduction in error vector magnitude compared to a time-invariant system. Comparison with an equivalent, scalable time-invariant antenna suggests that the switched transmitter behaves as though it has a 2-3 times lower radiation Q-factor and 20% higher radiation efficiency.

Submitted 20 August, 2021; v1 submitted 26 February, 2021; originally announced March 2021.
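For readers unfamiliar with the error vector magnitude (EVM) figure quoted above, the short sketch below computes EVM in dB from received and ideal PSK symbols. The symbol count and noise level are arbitrary; this is only a reminder of the metric, not the measurement procedure used in the paper.

```python
# Reminder of the EVM metric referenced above (not the paper's measurement setup):
# EVM compares received constellation points with their ideal reference symbols.
import numpy as np

def evm_db(received, reference):
    err = np.mean(np.abs(received - reference) ** 2)
    ref = np.mean(np.abs(reference) ** 2)
    return 10 * np.log10(err / ref)

# Hypothetical QPSK example: ideal symbols plus additive noise.
rng = np.random.default_rng(0)
bits = rng.integers(0, 4, 1000)
ideal = np.exp(1j * (np.pi / 4 + np.pi / 2 * bits))
noisy = ideal + 0.05 * (rng.standard_normal(1000) + 1j * rng.standard_normal(1000))
print(f"EVM = {evm_db(noisy, ideal):.1f} dB")   # lower (more negative) is better
```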
arXiv:2011.02198 [pdf, other] cs.SD; eess.AS

IEEE SLT 2021 Alpha-mini Speech Challenge: Open Datasets, Tracks, Rules and Baselines

Authors: Yihui Fu, Zhuoyuan Yao, Weipeng He, Jian Wu, Xiong Wang, Zhanheng Yang, Shimin Zhang, Lei Xie, Dongyan Huang, Hui Bu, Petr Motlicek, Jean-Marc Odobez

Abstract: The IEEE Spoken Language Technology Workshop (SLT) 2021 Alpha-mini Speech Challenge (ASC) is intended to improve research on keyword spotting (KWS) and sound source location (SSL) on humanoid robots. Many publications have reported significant improvements in deep-learning-based KWS and SSL on open-source datasets in recent years.
For deep learning model training, it is necessary to expand the data coverage to improve the robustness of the model. Thus, simulating multi-channel noisy and reverberant data from single-channel speech, noise, echo, and room impulse responses (RIRs) is widely adopted. However, this approach may create a mismatch between the simulated data and data recorded in real application scenarios, especially for echo data. In this challenge, we open-source a sizable speech, keyword, echo, and noise corpus to promote data-driven methods, particularly deep-learning approaches to KWS and SSL. We also choose Alpha-mini, a humanoid robot produced by UBTECH with a built-in four-microphone array on its head, to record development and evaluation sets under the actual Alpha-mini application scenario, including noise as well as echo and mechanical noise generated by the robot itself, for model evaluation. Furthermore, we describe the rules, evaluation methods, and baselines so that researchers can quickly assess their achievements and optimize their models.

Submitted 14 November, 2020; v1 submitted 4 November, 2020; originally announced November 2020.

Comments: Accepted at IEEE SLT 2021
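The data-simulation step mentioned above, generating multi-channel noisy and reverberant audio from single-channel speech and room impulse responses, can be sketched as below. The RIRs, noise, and SNR handling are simplified placeholders; the challenge's actual simulation tools are not reproduced.

```python
# Simplified sketch of multi-channel reverberant + noisy data simulation
# (placeholder RIRs and noise; not the challenge's official simulation code).
import numpy as np
from scipy.signal import fftconvolve

def simulate_multichannel(speech, rirs, noise, snr_db=10.0):
    """speech: (T,); rirs: (n_mics, L) room impulse responses; noise: (n_mics, T)."""
    reverberant = np.stack([fftconvolve(speech, h)[: len(speech)] for h in rirs])
    sig_pow = np.mean(reverberant ** 2)
    noise_pow = np.mean(noise ** 2)
    scale = np.sqrt(sig_pow / (noise_pow * 10 ** (snr_db / 10)))
    return reverberant + scale * noise

# Hypothetical 4-mic example with random decaying RIRs and random noise.
rng = np.random.default_rng(0)
speech = rng.standard_normal(16000)
rirs = rng.standard_normal((4, 2048)) * np.exp(-np.arange(2048) / 400)
noise = rng.standard_normal((4, 16000))
mix = simulate_multichannel(speech, rirs, noise, snr_db=10.0)
print(mix.shape)   # (4, 16000): one simulated channel per microphone
```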
arXiv:2008.00820 [pdf, other] cs.CV; cs.SD; eess.AS; doi: 10.1109/TIP.2020.3009820

Generating Visually Aligned Sound from Videos

Authors: Peihao Chen, Yang Zhang, Mingkui Tan, Hongdong Xiao, Deng Huang, Chuang Gan

Abstract: We focus on the task of generating sound from natural videos, where the sound should be both temporally and content-wise aligned with the visual signals. This task is extremely challenging because some sounds generated outside a camera's view cannot be inferred from the video content, and the model may be forced to learn an incorrect mapping between visual content and these irrelevant sounds. To address this challenge, we propose a framework named REGNET. In this framework, we first extract appearance and motion features from video frames to better distinguish the object that emits sound from complex background information. We then introduce an audio forwarding regularizer that takes the real sound as input and outputs bottlenecked sound features. Using both the visual and the bottlenecked sound features for sound prediction during training provides stronger supervision. The audio forwarding regularizer can control the irrelevant sound component and thus prevent the model from learning an incorrect mapping between video frames and sound emitted by objects that are off-screen. During testing, the audio forwarding regularizer is removed so that REGNET produces sound purely from visual features. Extensive evaluations based on Amazon Mechanical Turk demonstrate that our method significantly improves both temporal and content-wise alignment. Remarkably, our generated sound can fool humans with a 68.12% success rate. Code and pre-trained models are publicly available at https://github.com/PeihaoChen/regnet

Submitted 14 July, 2020; originally announced August 2020.

Comments: Published in IEEE Transactions on Image Processing, 2020. Code, pre-trained models and demo video: https://github.com/PeihaoChen/regnet
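A very rough sketch of the training-versus-testing asymmetry described above: during training, a heavily bottlenecked encoding of the ground-truth audio is concatenated with the visual features, while at test time the audio branch is dropped (here, replaced by zeros). All module shapes and names are placeholder assumptions; see the authors' repository linked above for the real implementation.

```python
# Rough sketch (placeholders; see https://github.com/PeihaoChen/regnet for the
# real model) of an audio forwarding regularizer: a bottlenecked audio branch
# helps during training and is removed at test time.
import torch
import torch.nn as nn

class SoundPredictor(nn.Module):
    def __init__(self, vis_dim=512, aud_dim=128, bottleneck=8, out_dim=80):
        super().__init__()
        self.bottleneck = bottleneck
        self.audio_bottleneck = nn.Linear(aud_dim, bottleneck)   # forwarding regularizer
        self.decoder = nn.Linear(vis_dim + bottleneck, out_dim)  # spectrogram frames

    def forward(self, visual_feat, real_audio_feat=None):
        if self.training and real_audio_feat is not None:
            a = self.audio_bottleneck(real_audio_feat)           # tiny hint from real audio
        else:                                                    # test: audio branch removed
            a = visual_feat.new_zeros(*visual_feat.shape[:-1], self.bottleneck)
        return self.decoder(torch.cat([visual_feat, a], dim=-1))

model = SoundPredictor()
vis, aud = torch.randn(4, 100, 512), torch.randn(4, 100, 128)
train_frames = model(vis, aud)     # training: visual + bottlenecked real audio
model.eval()
test_frames = model(vis)           # testing: purely visual features
print(train_frames.shape, test_frames.shape)
```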
arXiv:2007.10984 [pdf, other] cs.CV; cs.LG; cs.SD; eess.AS

Foley Music: Learning to Generate Music from Videos

Authors: Chuang Gan, Deng Huang, Peihao Chen, Joshua B. Tenenbaum, Antonio Torralba

Abstract: In this paper, we introduce Foley Music, a system that can synthesize plausible music for a silent video clip of people playing musical instruments.
We first identify two key intermediate representations for a successful video-to-music generator: body keypoints from videos and MIDI events from audio recordings. We then formulate music generation from videos as a motion-to-MIDI translation problem and present a Graph-Transformer framework that can accurately predict MIDI event sequences in accordance with the body movements. The MIDI events can then be converted to realistic music using an off-the-shelf music synthesizer. We demonstrate the effectiveness of our models on videos containing a variety of music performances. Experimental results show that our model outperforms several existing systems in generating music that is pleasant to listen to. Moreover, the MIDI representations are fully interpretable and transparent, enabling flexible music editing. We encourage readers to watch the demo video with audio turned on to experience the results.

Submitted 21 July, 2020; originally announced July 2020.

Comments: ECCV 2020. Project page: http://foley-music.csail.mit.edu
arXiv:2006.13522 [pdf] eess.IV; q-bio.QM

Focal Loss Analysis of Nerve Fiber Layer Reflectance for Glaucoma Diagnosis

Authors: Ou Tan, Liang Liu, Qisheng You, Jie Wang, Aiyin Chen, Eliesa Ing, John C. Morrison, Yali Jia, David Huang

Abstract: Purpose: To evaluate nerve fiber layer (NFL) reflectance for glaucoma diagnosis. Methods: Participants were imaged with 4.5x4.5-mm volumetric disc scans using spectral-domain optical coherence tomography (OCT). The normalized NFL reflectance map was processed by an azimuthal filter to reduce directional reflectance bias due to variation of the beam incidence angle. The peripapillary area of the map was divided into 160 superpixels. Average reflectance was the mean of the superpixel reflectance. Low-reflectance superpixels were identified as those with NFL reflectance below the 5th-percentile normative cutoff. Focal reflectance loss was measured by summing the loss over the low-reflectance superpixels. Results: Thirty-five normal, 30 pre-perimetric glaucoma (PPG), and 35 perimetric glaucoma (PG) participants were enrolled. Azimuthal filtering improved the repeatability of the normalized NFL reflectance, as measured by the pooled superpixel standard deviation (SD), from 0.73 to 0.57 dB (p < 0.001, paired t-test) and reduced the population SD from 2.14 to 1.78 dB (p < 0.001, t-test). Most glaucomatous reflectance maps showed characteristic patterns of contiguous wedge or diffuse defects. Focal NFL reflectance loss had significantly higher diagnostic sensitivity than the best NFL thickness parameter (overall, inferior, or focal loss volume): 53% vs. 23% (p = 0.027) in PPG eyes and 100% vs. 80% (p = 0.023) in PG eyes, with the specificity fixed at 99%. Conclusions: Azimuthal filtering reduces the variability of NFL reflectance measurements. Focal NFL reflectance loss has excellent glaucoma diagnostic accuracy compared with standard NFL thickness parameters. The reflectance map may be useful for localizing NFL defects.

Submitted 24 June, 2020; originally announced June 2020.

Comments: pages: 31; Tables: 6; Figures: 9
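The focal loss statistic described above (the summed reflectance shortfall over superpixels falling below a normative cutoff) can be sketched as follows. The superpixel values and normative reference are synthetic; the 5th-percentile cutoff is the only detail taken from the abstract.

```python
# Sketch of the focal reflectance-loss statistic described above, on synthetic data:
# sum the shortfall of superpixel reflectance below a per-superpixel normative cutoff.
import numpy as np

rng = np.random.default_rng(0)
normative = rng.normal(0.0, 1.8, size=(100, 160))   # normal eyes, 160 superpixels (dB)
cutoff = np.percentile(normative, 5, axis=0)         # 5th-percentile cutoff per superpixel

def focal_reflectance_loss(reflectance_map, cutoff):
    """reflectance_map: (160,) normalized NFL reflectance per superpixel, in dB."""
    shortfall = cutoff - reflectance_map
    low = shortfall > 0                               # superpixels below the cutoff
    return np.sum(shortfall[low]), np.count_nonzero(low)

eye = rng.normal(-1.0, 1.8, size=160)                 # hypothetical glaucomatous eye
loss, n_low = focal_reflectance_loss(eye, cutoff)
print(f"focal loss = {loss:.1f} dB over {n_low} low-reflectance superpixels")
```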
arXiv:2004.09476 [pdf, other] cs.CV; cs.LG; cs.MM; cs.SD; eess.AS

Music Gesture for Visual Sound Separation

Authors: Chuang Gan, Deng Huang, Hang Zhao, Joshua B. Tenenbaum, Antonio Torralba

Abstract: Recent deep learning approaches have achieved impressive performance on visual sound separation tasks.
However, these approaches are mostly built on appearance and optical-flow-like motion feature representations, which have limited ability to find the correlations between audio signals and visual points, especially when separating multiple instruments of the same type, such as multiple violins in a scene. To address this, we propose "Music Gesture," a keypoint-based structured representation that explicitly models the body and finger movements of musicians as they perform. We first adopt a context-aware graph network to integrate visual semantic context with body dynamics, and then apply an audio-visual fusion model to associate body movements with the corresponding audio signals. Experimental results on three music performance datasets show: 1) strong improvements over benchmark metrics for hetero-musical separation tasks (i.e., different instruments); and 2) a new ability to perform effective homo-musical separation for piano, flute, and trumpet duets, which to the best of our knowledge has not been achieved with alternative methods.

Submitted 20 April, 2020; originally announced April 2020.

Comments: CVPR 2020. Project page: http://music-gesture.csail.mit.edu
arXiv:2002.00552 [pdf, other] cs.LG; cs.CV; eess.IV

DWM: A Decomposable Winograd Method for Convolution Acceleration

Authors: Di Huang, Xishan Zhang, Rui Zhang, Tian Zhi, Deyuan He, Jiaming Guo, Chang Liu, Qi Guo, Zidong Du, Shaoli Liu, Tianshi Chen, Yunji Chen

Abstract: Winograd's minimal filtering algorithm has been widely used in Convolutional Neural Networks (CNNs) to reduce the number of multiplications for faster processing. However, it is only effective for convolutions with a 3x3 kernel and stride 1, because it suffers from significantly increased FLOPs and numerical accuracy problems for kernels larger than 3x3, and it fails on convolutions with stride larger than 1. In this paper, we propose a novel Decomposable Winograd Method (DWM) that extends the original Winograd minimal filtering algorithm to wide and general convolutions. DWM decomposes kernels with large size or large stride into several small kernels with stride 1, to which the Winograd method can then be applied, so that DWM reduces the number of multiplications while preserving numerical accuracy. It enables fast exploration of larger kernel sizes and stride values in CNNs for high performance and accuracy, and even opens the potential for new CNN designs. Compared with the original Winograd algorithm, the proposed DWM supports all kinds of convolutions with a speedup of ~2x without affecting numerical accuracy.

Submitted 2 February, 2020; originally announced February 2020.

Comments: Accepted by AAAI 2020
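The stride-decomposition idea behind DWM can be checked numerically: a stride-2 convolution (written as cross-correlation, as in CNN frameworks) equals the sum of two stride-1 convolutions over the even and odd phases of the input and kernel, each of which is then Winograd-friendly. The check below is a 1-D toy illustration, not the authors' implementation.

```python
# 1-D numerical check of the stride decomposition used by DWM (toy illustration):
# a stride-2 cross-correlation equals the sum of two stride-1 cross-correlations
# over the even/odd phases of the input and kernel.
import numpy as np

def corr(x, w, stride=1):
    """Valid cross-correlation, as used in CNN frameworks."""
    n_out = (len(x) - len(w)) // stride + 1
    return np.array([np.dot(x[i * stride:i * stride + len(w)], w)
                     for i in range(n_out)])

rng = np.random.default_rng(0)
x = rng.standard_normal(32)
w = rng.standard_normal(6)          # even-length kernel keeps the two phases aligned

direct = corr(x, w, stride=2)
decomposed = corr(x[0::2], w[0::2]) + corr(x[1::2], w[1::2])   # two stride-1 convs
print(np.allclose(direct, decomposed[: len(direct)]))          # -> True
```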
arXiv:1912.01167 [pdf, other] eess.AS; cs.SD

High-quality Speech Synthesis Using Super-resolution Mel-Spectrogram

Authors: Leyuan Sheng, Dong-Yan Huang, Evgeniy N. Pavlovskiy

Abstract: In speech synthesis and speech enhancement systems, mel-spectrograms need to be precise acoustic representations. However, the generated spectrograms tend to be over-smoothed and cannot produce high-quality synthesized speech.
Inspired by image-to-image translation, we address this problem by using a learning-based post-filter that combines Pix2PixHD and ResUnet to reconstruct the mel-spectrograms with super-resolution. From the resulting super-resolution spectrogram networks, we can generate enhanced spectrograms that produce high-quality synthesized speech. Our proposed model achieves improved mean opinion scores (MOS) of 3.71 and 4.01 over baseline results of 3.29 and 3.84 when using the Griffin-Lim and WaveNet vocoders, respectively.

Submitted 2 December, 2019; originally announced December 2019.

arXiv:1906.08673 [pdf] eess.IV; cs.MM

Enhancement of Underwater Images with Statistical Model of Background Light and Optimization of Transmission Map

Authors: Wei Song, Yan Wang, Dongmei Huang, Antonio Liotta, Cristian Perra

Abstract: Underwater images often have severe quality degradation and distortion due to light absorption and scattering in the water medium. A hazed image formation model is widely used to restore image quality; it depends on two optical parameters: the background light (BL) and the transmission map (TM). Underwater images can also be enhanced by color and contrast correction from the perspective of image processing.
In this paper, we propose an effective underwater image enhancement method that combines underwater image restoration and color correction. First, a manually annotated background lights (MABLs) database is developed. With reference to the relationship between MABLs and the histogram distributions of various underwater images, robust statistical models for BL estimation are provided. Next, the TM of the R channel is roughly estimated based on a new underwater dark channel prior derived from statistics of clear, high-resolution underwater images; a scene depth map based on the underwater light attenuation prior and an adjusted reversed saturation map are then used to compensate and refine the coarse TM of the R channel. The TMs of the G-B channels are estimated from the difference in attenuation ratios between the R channel and the G-B channels. Finally, to improve the color and contrast of the restored image with a natural appearance, a variant of white balance is introduced as post-processing. To guide the priorities of underwater image enhancement, thorough evaluations are conducted to discuss the impact of the key parameters, including the BL and the TM, and the importance of color correction. Comparisons with other state-of-the-art methods demonstrate that our method achieves higher accuracy of estimated BLs, less computation time, superior performance, and better information retention.

Submitted 19 June, 2019; originally announced June 2019.

Comments: 17 pages
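The hazed image formation model that the method above builds on can be illustrated in a few lines: given an estimated background light and transmission map, the scene radiance is recovered by inverting I = J * t + B * (1 - t). The example values of B and t are arbitrary placeholders; the paper's contribution is how BL and TM are estimated, which is not reproduced here.

```python
# Illustration of restoring scene radiance with the hazed image formation model
# I = J * t + B * (1 - t); the background light B and transmission map t used
# below are placeholders, not estimates from the paper's statistical models.
import numpy as np

def restore(I, B, t, t_min=0.1):
    """I: observed image (H,W,3) in [0,1]; B: background light (3,); t: (H,W)."""
    t = np.clip(t, t_min, 1.0)[..., None]        # avoid division by tiny transmission
    J = (I - B) / t + B                          # invert the image formation model
    return np.clip(J, 0.0, 1.0)

rng = np.random.default_rng(0)
I = rng.random((120, 160, 3))                    # stand-in underwater image
B = np.array([0.15, 0.55, 0.60])                 # assumed bluish-green background light
t = np.linspace(0.2, 0.9, 160)[None, :].repeat(120, axis=0)   # assumed transmission map
J = restore(I, B, t)
print(J.shape, J.min(), J.max())
```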
