CINXE.COM

Search | arXiv e-print repository

<!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"/> <meta name="viewport" content="width=device-width, initial-scale=1"/> <!-- new favicon config and versions by realfavicongenerator.net --> <link rel="apple-touch-icon" sizes="180x180" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-16x16.png"> <link rel="manifest" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/site.webmanifest"> <link rel="mask-icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/safari-pinned-tab.svg" color="#b31b1b"> <link rel="shortcut icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon.ico"> <meta name="msapplication-TileColor" content="#b31b1b"> <meta name="msapplication-config" content="images/icons/browserconfig.xml"> <meta name="theme-color" content="#b31b1b"> <!-- end favicon config --> <title>Search | arXiv e-print repository</title> <script defer src="https://static.arxiv.org/static/base/1.0.0a5/fontawesome-free-5.11.2-web/js/all.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/base/1.0.0a5/css/arxivstyle.css" /> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ messageStyle: "none", extensions: ["tex2jax.js"], jax: ["input/TeX", "output/HTML-CSS"], tex2jax: { inlineMath: [ ['$','$'], ["\\(","\\)"] ], displayMath: [ ['$$','$$'], ["\\[","\\]"] ], processEscapes: true, ignoreClass: '.*', processClass: 'mathjax.*' }, TeX: { extensions: ["AMSmath.js", "AMSsymbols.js", "noErrors.js"], noErrors: { inlineDelimiters: ["$","$"], multiLine: false, style: { "font-size": "normal", "border": "" } } }, "HTML-CSS": { availableFonts: ["TeX"] } }); </script> <script src='//static.arxiv.org/MathJax-2.7.3/MathJax.js'></script> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/notification.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/bulma-tooltip.min.css" /> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/search.css" /> <script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g=" crossorigin="anonymous"></script> <script src="https://static.arxiv.org/static/search/0.5.6/js/fieldset.js"></script> <style> radio#cf-customfield_11400 { display: none; } </style> </head> <body> <header><a href="#main-container" class="is-sr-only">Skip to main content</a> <!-- contains Cornell logo and sponsor statement --> <div class="attribution level is-marginless" role="banner"> <div class="level-left"> <a class="level-item" href="https://cornell.edu/"><img src="https://static.arxiv.org/static/base/1.0.0a5/images/cornell-reduced-white-SMALL.svg" alt="Cornell University" width="200" aria-label="logo" /></a> </div> <div class="level-right is-marginless"><p class="sponsors level-item is-marginless"><span id="support-ack-url">We gratefully acknowledge support from<br /> the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors. <a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1&ndash;46 of 46 results for author: <span class="mathjax">Xu, G</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> <div class="content"> <form method="GET" action="/search/eess" aria-role="search"> Searching in archive <strong>eess</strong>. <a href="/search/?searchtype=author&amp;query=Xu%2C+G">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." type="text" value="Xu, G"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Xu%2C+G&amp;terms-0-field=author&amp;size=50&amp;order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Xu, G"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. </div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.09561">arXiv:2502.09561</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.09561">pdf</a>, <a href="https://arxiv.org/format/2502.09561">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Optimization and Control">math.OC</span> </div> </div> <p class="title is-5 mathjax"> Enhancing Traffic Safety Analysis with Digital Twin Technology: Integrating Vehicle Dynamics and Environmental Factors into Microscopic Traffic Simulation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Xu%2C+G">Guanhao Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+J">Jianfei Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+Z">Zejiang Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Zhou%2C+A">Anye Zhou</a>, <a href="/search/eess?searchtype=author&amp;query=Schrader%2C+M">Max Schrader</a>, <a href="/search/eess?searchtype=author&amp;query=Bittle%2C+J">Joshua Bittle</a>, <a href="/search/eess?searchtype=author&amp;query=Shao%2C+Y">Yunli Shao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.09561v1-abstract-short" style="display: inline;"> Traffic safety is a critical concern in transportation engineering and urban planning. Traditional traffic safety analysis requires trained observers to collect data in the field, which is time-consuming, labor-intensive, and sometimes inaccurate. In recent years, microscopic traffic simulation, which simulates individual vehicles&#39; movements within a transportation network, have been utilized to s&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.09561v1-abstract-full').style.display = 'inline'; document.getElementById('2502.09561v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.09561v1-abstract-full" style="display: none;"> Traffic safety is a critical concern in transportation engineering and urban planning. Traditional traffic safety analysis requires trained observers to collect data in the field, which is time-consuming, labor-intensive, and sometimes inaccurate. In recent years, microscopic traffic simulation, which simulates individual vehicles&#39; movements within a transportation network, have been utilized to study traffic safety. However, microscopic traffic simulation only focuses on traffic-related factors, such as traffic volume, traffic signals, and lane configurations, neglecting vehicle dynamics and environment-related factors like weather and lighting conditions, which can significantly impact traffic safety. In light of this, this paper explores the application of digital twin technology in traffic safety analysis, integrating vehicle simulators, which consider vehicle dynamics and environmental factors, and microscopic traffic simulators, which simulate the operations of traffic flow, for enhanced safety evaluations. Various scenarios, including different weather conditions and visibility levels, are simulated using a digital twin of a road segment in Tuscaloosa, Alabama. The simulations employ Surrogate Safety Measures (SSMs) like Time to Collision (TTC) and Deceleration Rate to Avoid a Crash (DRAC) to assess safety under varying conditions. The results demonstrate that traffic digital twin can identify potential safety issues that traditional microscopic simulation cannot, providing insights for improving traffic control strategies and transportation infrastructure to enhance traffic safety. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.09561v1-abstract-full').style.display = 'none'; document.getElementById('2502.09561v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.00139">arXiv:2502.00139</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.00139">pdf</a>, <a href="https://arxiv.org/format/2502.00139">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Beamforming with Joint Phase and Time Array: System Design, Prototyping and Performance </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Mo%2C+J">Jianhua Mo</a>, <a href="/search/eess?searchtype=author&amp;query=AlAmmouri%2C+A">Ahmad AlAmmouri</a>, <a href="/search/eess?searchtype=author&amp;query=Dong%2C+S">Shenggang Dong</a>, <a href="/search/eess?searchtype=author&amp;query=Nam%2C+Y">Younghan Nam</a>, <a href="/search/eess?searchtype=author&amp;query=Choi%2C+W">Won-Suk Choi</a>, <a href="/search/eess?searchtype=author&amp;query=Xu%2C+G">Gary Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Jianzhong"> Jianzhong</a>, <a href="/search/eess?searchtype=author&amp;query=Zhan"> Zhan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.00139v1-abstract-short" style="display: inline;"> Joint phase-time arrays (JPTA) is a new mmWave radio frequency front-end architecture constructed with appending time-delay elements to phase shifters for analog beamforming. JPTA allows the mmWave base station (BS) to form multiple frequency-dependent beams with a single RF chain, exploiting the extra degrees of freedom the time-delay elements offer. Without requiring extra power-hungry RF chains&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.00139v1-abstract-full').style.display = 'inline'; document.getElementById('2502.00139v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.00139v1-abstract-full" style="display: none;"> Joint phase-time arrays (JPTA) is a new mmWave radio frequency front-end architecture constructed with appending time-delay elements to phase shifters for analog beamforming. JPTA allows the mmWave base station (BS) to form multiple frequency-dependent beams with a single RF chain, exploiting the extra degrees of freedom the time-delay elements offer. Without requiring extra power-hungry RF chains, a BS with JPTA can schedule multiple users in different directions in a frequency-division multiplexing (FDM) manner. A BS with JPTA achieves various advantages over the traditional analog beamforming system. Simulation results show that JPTA can bring significant system-level benefits, e.g., extending uplink throughput coverage by 100%. To realize these system benefits of JPTA, high-resolution delay elements with a wide delay dynamic range are essential. With newly developed delay elements, we demonstrate that a single TRX RF chain can serve four users in four different directions in the mmWave band. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.00139v1-abstract-full').style.display = 'none'; document.getElementById('2502.00139v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Presented at Asilomar Conference on Signals, Systems, and Computers 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.08139">arXiv:2501.08139</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.08139">pdf</a>, <a href="https://arxiv.org/format/2501.08139">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> EEG-ReMinD: Enhancing Neurodegenerative EEG Decoding through Self-Supervised State Reconstruction-Primed Riemannian Dynamics </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Wang%2C+Z">Zirui Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Song%2C+Z">Zhenxi Song</a>, <a href="/search/eess?searchtype=author&amp;query=Guo%2C+Y">Yi Guo</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+Y">Yuxin Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Xu%2C+G">Guoyang Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+M">Min Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+Z">Zhiguo Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.08139v1-abstract-short" style="display: inline;"> The development of EEG decoding algorithms confronts challenges such as data sparsity, subject variability, and the need for precise annotations, all of which are vital for advancing brain-computer interfaces and enhancing the diagnosis of diseases. To address these issues, we propose a novel two-stage approach named Self-Supervised State Reconstruction-Primed Riemannian Dynamics (EEG-ReMinD) , wh&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.08139v1-abstract-full').style.display = 'inline'; document.getElementById('2501.08139v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.08139v1-abstract-full" style="display: none;"> The development of EEG decoding algorithms confronts challenges such as data sparsity, subject variability, and the need for precise annotations, all of which are vital for advancing brain-computer interfaces and enhancing the diagnosis of diseases. To address these issues, we propose a novel two-stage approach named Self-Supervised State Reconstruction-Primed Riemannian Dynamics (EEG-ReMinD) , which mitigates reliance on supervised learning and integrates inherent geometric features. This approach efficiently handles EEG data corruptions and reduces the dependency on labels. EEG-ReMinD utilizes self-supervised and geometric learning techniques, along with an attention mechanism, to analyze the temporal dynamics of EEG features within the framework of Riemannian geometry, referred to as Riemannian dynamics. Comparative analyses on both intact and corrupted datasets from two different neurodegenerative disorders underscore the enhanced performance of EEG-ReMinD. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.08139v1-abstract-full').style.display = 'none'; document.getElementById('2501.08139v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.16928">arXiv:2412.16928</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.16928">pdf</a>, <a href="https://arxiv.org/format/2412.16928">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> AV-DTEC: Self-Supervised Audio-Visual Fusion for Drone Trajectory Estimation and Classification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Xiao%2C+Z">Zhenyuan Xiao</a>, <a href="/search/eess?searchtype=author&amp;query=Yang%2C+Y">Yizhuo Yang</a>, <a href="/search/eess?searchtype=author&amp;query=Xu%2C+G">Guili Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Zeng%2C+X">Xianglong Zeng</a>, <a href="/search/eess?searchtype=author&amp;query=Yuan%2C+S">Shenghai Yuan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.16928v1-abstract-short" style="display: inline;"> The increasing use of compact UAVs has created significant threats to public safety, while traditional drone detection systems are often bulky and costly. To address these challenges, we propose AV-DTEC, a lightweight self-supervised audio-visual fusion-based anti-UAV system. AV-DTEC is trained using self-supervised learning with labels generated by LiDAR, and it simultaneously learns audio and vi&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.16928v1-abstract-full').style.display = 'inline'; document.getElementById('2412.16928v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.16928v1-abstract-full" style="display: none;"> The increasing use of compact UAVs has created significant threats to public safety, while traditional drone detection systems are often bulky and costly. To address these challenges, we propose AV-DTEC, a lightweight self-supervised audio-visual fusion-based anti-UAV system. AV-DTEC is trained using self-supervised learning with labels generated by LiDAR, and it simultaneously learns audio and visual features through a parallel selective state-space model. With the learned features, a specially designed plug-and-play primary-auxiliary feature enhancement module integrates visual features into audio features for better robustness in cross-lighting conditions. To reduce reliance on auxiliary features and align modalities, we propose a teacher-student model that adaptively adjusts the weighting of visual features. AV-DTEC demonstrates exceptional accuracy and effectiveness in real-world multi-modality data. The code and trained models are publicly accessible on GitHub \url{https://github.com/AmazingDay1/AV-DETC}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.16928v1-abstract-full').style.display = 'none'; document.getElementById('2412.16928v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Submitted to ICRA 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.13037">arXiv:2412.13037</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.13037">pdf</a>, <a href="https://arxiv.org/format/2412.13037">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> TAME: Temporal Audio-based Mamba for Enhanced Drone Trajectory Estimation and Classification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Xiao%2C+Z">Zhenyuan Xiao</a>, <a href="/search/eess?searchtype=author&amp;query=Hu%2C+H">Huanran Hu</a>, <a href="/search/eess?searchtype=author&amp;query=Xu%2C+G">Guili Xu</a>, <a href="/search/eess?searchtype=author&amp;query=He%2C+J">Junwei He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.13037v6-abstract-short" style="display: inline;"> The increasing prevalence of compact UAVs has introduced significant risks to public safety, while traditional drone detection systems are often bulky and costly. To address these challenges, we present TAME, the Temporal Audio-based Mamba for Enhanced Drone Trajectory Estimation and Classification. This innovative anti-UAV detection model leverages a parallel selective state-space model to simult&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.13037v6-abstract-full').style.display = 'inline'; document.getElementById('2412.13037v6-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.13037v6-abstract-full" style="display: none;"> The increasing prevalence of compact UAVs has introduced significant risks to public safety, while traditional drone detection systems are often bulky and costly. To address these challenges, we present TAME, the Temporal Audio-based Mamba for Enhanced Drone Trajectory Estimation and Classification. This innovative anti-UAV detection model leverages a parallel selective state-space model to simultaneously capture and learn both the temporal and spectral features of audio, effectively analyzing propagation of sound. To further enhance temporal features, we introduce a Temporal Feature Enhancement Module, which integrates spectral features into temporal data using residual cross-attention. This enhanced temporal information is then employed for precise 3D trajectory estimation and classification. Our model sets a new standard of performance on the MMUAD benchmarks, demonstrating superior accuracy and effectiveness. The code and trained models are publicly available on GitHub \url{https://github.com/AmazingDay1/TAME}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.13037v6-abstract-full').style.display = 'none'; document.getElementById('2412.13037v6-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 17 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This paper has been accepted for presentation at the IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP) 2025. Personal use of this material is permitted. Permission from IEEE must be obtained for all other uses</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.13766">arXiv:2411.13766</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.13766">pdf</a>, <a href="https://arxiv.org/format/2411.13766">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Tiny-Align: Bridging Automatic Speech Recognition and Large Language Model on the Edge </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Qin%2C+R">Ruiyang Qin</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+D">Dancheng Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Xu%2C+G">Gelei Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Yan%2C+Z">Zheyu Yan</a>, <a href="/search/eess?searchtype=author&amp;query=Xu%2C+C">Chenhui Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Hu%2C+Y">Yuting Hu</a>, <a href="/search/eess?searchtype=author&amp;query=Hu%2C+X+S">X. Sharon Hu</a>, <a href="/search/eess?searchtype=author&amp;query=Xiong%2C+J">Jinjun Xiong</a>, <a href="/search/eess?searchtype=author&amp;query=Shi%2C+Y">Yiyu Shi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.13766v2-abstract-short" style="display: inline;"> The combination of Large Language Models (LLM) and Automatic Speech Recognition (ASR), when deployed on edge devices (called edge ASR-LLM), can serve as a powerful personalized assistant to enable audio-based interaction for users. Compared to text-based interaction, edge ASR-LLM allows accessible and natural audio interactions. Unfortunately, existing ASR-LLM models are mainly trained in high-per&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13766v2-abstract-full').style.display = 'inline'; document.getElementById('2411.13766v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.13766v2-abstract-full" style="display: none;"> The combination of Large Language Models (LLM) and Automatic Speech Recognition (ASR), when deployed on edge devices (called edge ASR-LLM), can serve as a powerful personalized assistant to enable audio-based interaction for users. Compared to text-based interaction, edge ASR-LLM allows accessible and natural audio interactions. Unfortunately, existing ASR-LLM models are mainly trained in high-performance computing environments and produce substantial model weights, making them difficult to deploy on edge devices. More importantly, to better serve users&#39; personalized needs, the ASR-LLM must be able to learn from each distinct user, given that audio input often contains highly personalized characteristics that necessitate personalized on-device training. Since individually fine-tuning the ASR or LLM often leads to suboptimal results due to modality-specific limitations, end-to-end training ensures seamless integration of audio features and language understanding (cross-modal alignment), ultimately enabling a more personalized and efficient adaptation on edge devices. However, due to the complex training requirements and substantial computational demands of existing approaches, cross-modal alignment between ASR audio and LLM can be challenging on edge devices. In this work, we propose a resource-efficient cross-modal alignment framework that bridges ASR and LLMs on edge devices to handle personalized audio input. Our framework enables efficient ASR-LLM alignment on resource-constrained devices like NVIDIA Jetson Orin (8GB RAM), achieving 50x training time speedup while improving the alignment quality by more than 50\%. To the best of our knowledge, this is the first work to study efficient ASR-LLM alignment on resource-constrained edge devices. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13766v2-abstract-full').style.display = 'none'; document.getElementById('2411.13766v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 20 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">7 pages, 8 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11116">arXiv:2411.11116</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.11116">pdf</a>, <a href="https://arxiv.org/ps/2411.11116">ps</a>, <a href="https://arxiv.org/format/2411.11116">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> DBF-Net: A Dual-Branch Network with Feature Fusion for Ultrasound Image Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Xu%2C+G">Guoping Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Wu%2C+X">Ximing Wu</a>, <a href="/search/eess?searchtype=author&amp;query=Liao%2C+W">Wentao Liao</a>, <a href="/search/eess?searchtype=author&amp;query=Wu%2C+X">Xinglong Wu</a>, <a href="/search/eess?searchtype=author&amp;query=Huang%2C+Q">Qing Huang</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+C">Chang Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11116v1-abstract-short" style="display: inline;"> Accurately segmenting lesions in ultrasound images is challenging due to the difficulty in distinguishing boundaries between lesions and surrounding tissues. While deep learning has improved segmentation accuracy, there is limited focus on boundary quality and its relationship with body structures. To address this, we introduce UBBS-Net, a dual-branch deep neural network that learns the relationsh&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11116v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11116v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11116v1-abstract-full" style="display: none;"> Accurately segmenting lesions in ultrasound images is challenging due to the difficulty in distinguishing boundaries between lesions and surrounding tissues. While deep learning has improved segmentation accuracy, there is limited focus on boundary quality and its relationship with body structures. To address this, we introduce UBBS-Net, a dual-branch deep neural network that learns the relationship between body and boundary for improved segmentation. We also propose a feature fusion module to integrate body and boundary information. Evaluated on three public datasets, UBBS-Net outperforms existing methods, achieving Dice Similarity Coefficients of 81.05% for breast cancer, 76.41% for brachial plexus nerves, and 87.75% for infantile hemangioma segmentation. Our results demonstrate the effectiveness of UBBS-Net for ultrasound image segmentation. The code is available at https://github.com/apple1986/DBF-Net. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11116v1-abstract-full').style.display = 'none'; document.getElementById('2411.11116v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.05771">arXiv:2411.05771</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.05771">pdf</a>, <a href="https://arxiv.org/format/2411.05771">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Optimization and Control">math.OC</span> </div> </div> <p class="title is-5 mathjax"> Sketched Equivariant Imaging Regularization and Deep Internal Learning for Inverse Problems </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Xu%2C+G">Guixian Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+J">Jinglai Li</a>, <a href="/search/eess?searchtype=author&amp;query=Tang%2C+J">Junqi Tang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.05771v3-abstract-short" style="display: inline;"> Equivariant Imaging (EI) regularization has become the de-facto technique for unsupervised training of deep imaging networks, without any need of ground-truth data. Observing that the EI-based unsupervised training paradigm currently has significant computational redundancy leading to inefficiency in high-dimensional applications, we propose a sketched EI regularization which leverages the randomi&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05771v3-abstract-full').style.display = 'inline'; document.getElementById('2411.05771v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.05771v3-abstract-full" style="display: none;"> Equivariant Imaging (EI) regularization has become the de-facto technique for unsupervised training of deep imaging networks, without any need of ground-truth data. Observing that the EI-based unsupervised training paradigm currently has significant computational redundancy leading to inefficiency in high-dimensional applications, we propose a sketched EI regularization which leverages the randomized sketching techniques for acceleration. We then extend our sketched EI regularization to develop an accelerated deep internal learning framework, Sketched Equivariant Deep Image Prior (Sk-EI-DIP), which can be efficiently applied for single-image and task-adapted reconstruction. Additionally, for network adaptation tasks, we propose a parameter-efficient approach for accelerating both EI-DIP and Sk-EI-DIP via optimizing only the normalization layers. Our numerical study on X-ray CT and multi-coil MRI image reconstruction tasks demonstrate that our approach can achieve significant computational acceleration over standard EI-based counterpart in single-input setting and network adaptation at test time. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05771v3-abstract-full').style.display = 'none'; document.getElementById('2411.05771v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 8 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">22 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.04541">arXiv:2411.04541</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.04541">pdf</a>, <a href="https://arxiv.org/format/2411.04541">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Low Complexity Joint Chromatic Dispersion and Time/Frequency Offset Estimation Based on Fractional Fourier Transform </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Xu%2C+G">Guozhi Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Niu%2C+Z">Zekun Niu</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+L">Lyu Li</a>, <a href="/search/eess?searchtype=author&amp;query=Hu%2C+W">Weisheng Hu</a>, <a href="/search/eess?searchtype=author&amp;query=Yi%2C+L">Lilin Yi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.04541v1-abstract-short" style="display: inline;"> We propose and experimentally validate a joint estimation method for chromatic dispersion and time-frequency offset based on the fractional Fourier transform, which reduces computational complexity by more than 50% while keeping estimation accuracy. </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.04541v1-abstract-full" style="display: none;"> We propose and experimentally validate a joint estimation method for chromatic dispersion and time-frequency offset based on the fractional Fourier transform, which reduces computational complexity by more than 50% while keeping estimation accuracy. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04541v1-abstract-full').style.display = 'none'; document.getElementById('2411.04541v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages, 5 figures, 1 table, ACPIPOC2024 accept</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.00813">arXiv:2411.00813</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.00813">pdf</a>, <a href="https://arxiv.org/format/2411.00813">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Social and Information Networks">cs.SI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Personality Analysis from Online Short Video Platforms with Multi-domain Adaptation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=An%2C+S">Sixu An</a>, <a href="/search/eess?searchtype=author&amp;query=Sun%2C+X">Xiangguo Sun</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+Y">Yicong Li</a>, <a href="/search/eess?searchtype=author&amp;query=Yang%2C+Y">Yu Yang</a>, <a href="/search/eess?searchtype=author&amp;query=Xu%2C+G">Guandong Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.00813v1-abstract-short" style="display: inline;"> Personality analysis from online short videos has gained prominence due to its applications in personalized recommendation systems, sentiment analysis, and human-computer interaction. Traditional assessment methods, such as questionnaires based on the Big Five Personality Framework, are limited by self-report biases and are impractical for large-scale or real-time analysis. Leveraging the rich, mu&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00813v1-abstract-full').style.display = 'inline'; document.getElementById('2411.00813v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.00813v1-abstract-full" style="display: none;"> Personality analysis from online short videos has gained prominence due to its applications in personalized recommendation systems, sentiment analysis, and human-computer interaction. Traditional assessment methods, such as questionnaires based on the Big Five Personality Framework, are limited by self-report biases and are impractical for large-scale or real-time analysis. Leveraging the rich, multi-modal data present in short videos offers a promising alternative for more accurate personality inference. However, integrating these diverse and asynchronous modalities poses significant challenges, particularly in aligning time-varying data and ensuring models generalize well to new domains with limited labeled data. In this paper, we propose a novel multi-modal personality analysis framework that addresses these challenges by synchronizing and integrating features from multiple modalities and enhancing model generalization through domain adaptation. We introduce a timestamp-based modality alignment mechanism that synchronizes data based on spoken word timestamps, ensuring accurate correspondence across modalities and facilitating effective feature integration. To capture temporal dependencies and inter-modal interactions, we employ Bidirectional Long Short-Term Memory networks and self-attention mechanisms, allowing the model to focus on the most informative features for personality prediction. Furthermore, we develop a gradient-based domain adaptation method that transfers knowledge from multiple source domains to improve performance in target domains with scarce labeled data. Extensive experiments on real-world datasets demonstrate that our framework significantly outperforms existing methods in personality prediction tasks, highlighting its effectiveness in capturing complex behavioral cues and robustness in adapting to new domains. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00813v1-abstract-full').style.display = 'none'; document.getElementById('2411.00813v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.08228">arXiv:2408.08228</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.08228">pdf</a>, <a href="https://arxiv.org/format/2408.08228">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Rethinking Medical Anomaly Detection in Brain MRI: An Image Quality Assessment Perspective </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Pan%2C+Z">Zixuan Pan</a>, <a href="/search/eess?searchtype=author&amp;query=Xia%2C+J">Jun Xia</a>, <a href="/search/eess?searchtype=author&amp;query=Yan%2C+Z">Zheyu Yan</a>, <a href="/search/eess?searchtype=author&amp;query=Xu%2C+G">Guoyue Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Wu%2C+Y">Yawen Wu</a>, <a href="/search/eess?searchtype=author&amp;query=Jia%2C+Z">Zhenge Jia</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+J">Jianxu Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Shi%2C+Y">Yiyu Shi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.08228v1-abstract-short" style="display: inline;"> Reconstruction-based methods, particularly those leveraging autoencoders, have been widely adopted to perform anomaly detection in brain MRI. While most existing works try to improve detection accuracy by proposing new model structures or algorithms, we tackle the problem through image quality assessment, an underexplored perspective in the field. We propose a fusion quality loss function that com&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.08228v1-abstract-full').style.display = 'inline'; document.getElementById('2408.08228v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.08228v1-abstract-full" style="display: none;"> Reconstruction-based methods, particularly those leveraging autoencoders, have been widely adopted to perform anomaly detection in brain MRI. While most existing works try to improve detection accuracy by proposing new model structures or algorithms, we tackle the problem through image quality assessment, an underexplored perspective in the field. We propose a fusion quality loss function that combines Structural Similarity Index Measure loss with l1 loss, offering a more comprehensive evaluation of reconstruction quality. Additionally, we introduce a data pre-processing strategy that enhances the average intensity ratio (AIR) between normal and abnormal regions, further improving the distinction of anomalies. By fusing the aforementioned two methods, we devise the image quality assessment (IQA) approach. The proposed IQA approach achieves significant improvements (&gt;10%) in terms of Dice coefficient (DICE) and Area Under the Precision-Recall Curve (AUPRC) on the BraTS21 (T2, FLAIR) and MSULB datasets when compared with state-of-the-art methods. These results highlight the importance of invoking the comprehensive image quality assessment in medical anomaly detection and provide a new perspective for future research in this field. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.08228v1-abstract-full').style.display = 'none'; document.getElementById('2408.08228v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.01725">arXiv:2405.01725</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2405.01725">pdf</a>, <a href="https://arxiv.org/format/2405.01725">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Development of Skip Connection in Deep Neural Networks for Computer Vision and Medical Image Analysis: A Survey </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Xu%2C+G">Guoping Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+X">Xiaxia Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Wu%2C+X">Xinglong Wu</a>, <a href="/search/eess?searchtype=author&amp;query=Leng%2C+X">Xuesong Leng</a>, <a href="/search/eess?searchtype=author&amp;query=Xu%2C+Y">Yongchao Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.01725v1-abstract-short" style="display: inline;"> Deep learning has made significant progress in computer vision, specifically in image classification, object detection, and semantic segmentation. The skip connection has played an essential role in the architecture of deep neural networks,enabling easier optimization through residual learning during the training stage and improving accuracy during testing. Many neural networks have inherited the&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.01725v1-abstract-full').style.display = 'inline'; document.getElementById('2405.01725v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.01725v1-abstract-full" style="display: none;"> Deep learning has made significant progress in computer vision, specifically in image classification, object detection, and semantic segmentation. The skip connection has played an essential role in the architecture of deep neural networks,enabling easier optimization through residual learning during the training stage and improving accuracy during testing. Many neural networks have inherited the idea of residual learning with skip connections for various tasks, and it has been the standard choice for designing neural networks. This survey provides a comprehensive summary and outlook on the development of skip connections in deep neural networks. The short history of skip connections is outlined, and the development of residual learning in deep neural networks is surveyed. The effectiveness of skip connections in the training and testing stages is summarized, and future directions for using skip connections in residual learning are discussed. Finally, we summarize seminal papers, source code, models, and datasets that utilize skip connections in computer vision, including image classification, object detection, semantic segmentation, and image reconstruction. We hope this survey could inspire peer researchers in the community to develop further skip connections in various forms and tasks and the theory of residual learning in deep neural networks. The project page can be found at https://github.com/apple1986/Residual_Learning_For_Images <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.01725v1-abstract-full').style.display = 'none'; document.getElementById('2405.01725v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.15854">arXiv:2404.15854</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2404.15854">pdf</a>, <a href="https://arxiv.org/format/2404.15854">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> CLAD: Robust Audio Deepfake Detection Against Manipulation Attacks with Contrastive Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Wu%2C+H">Haolin Wu</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+J">Jing Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Du%2C+R">Ruiying Du</a>, <a href="/search/eess?searchtype=author&amp;query=Wu%2C+C">Cong Wu</a>, <a href="/search/eess?searchtype=author&amp;query=He%2C+K">Kun He</a>, <a href="/search/eess?searchtype=author&amp;query=Shang%2C+X">Xingcan Shang</a>, <a href="/search/eess?searchtype=author&amp;query=Ren%2C+H">Hao Ren</a>, <a href="/search/eess?searchtype=author&amp;query=Xu%2C+G">Guowen Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.15854v1-abstract-short" style="display: inline;"> The increasing prevalence of audio deepfakes poses significant security threats, necessitating robust detection methods. While existing detection systems exhibit promise, their robustness against malicious audio manipulations remains underexplored. To bridge the gap, we undertake the first comprehensive study of the susceptibility of the most widely adopted audio deepfake detectors to manipulation&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.15854v1-abstract-full').style.display = 'inline'; document.getElementById('2404.15854v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.15854v1-abstract-full" style="display: none;"> The increasing prevalence of audio deepfakes poses significant security threats, necessitating robust detection methods. While existing detection systems exhibit promise, their robustness against malicious audio manipulations remains underexplored. To bridge the gap, we undertake the first comprehensive study of the susceptibility of the most widely adopted audio deepfake detectors to manipulation attacks. Surprisingly, even manipulations like volume control can significantly bypass detection without affecting human perception. To address this, we propose CLAD (Contrastive Learning-based Audio deepfake Detector) to enhance the robustness against manipulation attacks. The key idea is to incorporate contrastive learning to minimize the variations introduced by manipulations, therefore enhancing detection robustness. Additionally, we incorporate a length loss, aiming to improve the detection accuracy by clustering real audios more closely in the feature space. We comprehensively evaluated the most widely adopted audio deepfake detection models and our proposed CLAD against various manipulation attacks. The detection models exhibited vulnerabilities, with FAR rising to 36.69%, 31.23%, and 51.28% under volume control, fading, and noise injection, respectively. CLAD enhanced robustness, reducing the FAR to 0.81% under noise injection and consistently maintaining an FAR below 1.63% across all tests. Our source code and documentation are available in the artifact repository (https://github.com/CLAD23/CLAD). <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.15854v1-abstract-full').style.display = 'none'; document.getElementById('2404.15854v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Submitted to IEEE TDSC</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.10980">arXiv:2403.10980</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2403.10980">pdf</a>, <a href="https://arxiv.org/format/2403.10980">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Science and Game Theory">cs.GT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Optimization and Control">math.OC</span> </div> </div> <p class="title is-5 mathjax"> Inverse learning of black-box aggregator for robust Nash equilibrium </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Chen%2C+G">Guanpu Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Xu%2C+G">Gehui Xu</a>, <a href="/search/eess?searchtype=author&amp;query=He%2C+F">Fengxiang He</a>, <a href="/search/eess?searchtype=author&amp;query=Tao%2C+D">Dacheng Tao</a>, <a href="/search/eess?searchtype=author&amp;query=Parisini%2C+T">Thomas Parisini</a>, <a href="/search/eess?searchtype=author&amp;query=Johansson%2C+K+H">Karl Henrik Johansson</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.10980v1-abstract-short" style="display: inline;"> In this note, we investigate the robustness of Nash equilibria (NE) in multi-player aggregative games with coupling constraints. There are many algorithms for computing an NE of an aggregative game given a known aggregator. When the coupling parameters are affected by uncertainty, robust NE need to be computed. We consider a scenario where players&#39; weight in the aggregator is unknown, making the a&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.10980v1-abstract-full').style.display = 'inline'; document.getElementById('2403.10980v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.10980v1-abstract-full" style="display: none;"> In this note, we investigate the robustness of Nash equilibria (NE) in multi-player aggregative games with coupling constraints. There are many algorithms for computing an NE of an aggregative game given a known aggregator. When the coupling parameters are affected by uncertainty, robust NE need to be computed. We consider a scenario where players&#39; weight in the aggregator is unknown, making the aggregator kind of &#34;a black box&#34;. We pursue a suitable learning approach to estimate the unknown aggregator by proposing an inverse variational inequality-based relationship. We then utilize the counterpart to reconstruct the game and obtain first-order conditions for robust NE in the worst case. Furthermore, we characterize the generalization property of the learning methodology via an upper bound on the violation probability. Simulation experiments show the effectiveness of the proposed inverse learning approach. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.10980v1-abstract-full').style.display = 'none'; document.getElementById('2403.10980v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2312.10949">arXiv:2312.10949</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2312.10949">pdf</a>, <a href="https://arxiv.org/format/2312.10949">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1007/978-3-031-05936-0_31">10.1007/978-3-031-05936-0_31 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Leveraged Mel spectrograms using Harmonic and Percussive Components in Speech Emotion Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Rudd%2C+D+H">David Hason Rudd</a>, <a href="/search/eess?searchtype=author&amp;query=Huo%2C+H">Huan Huo</a>, <a href="/search/eess?searchtype=author&amp;query=Xu%2C+G">Guandong Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2312.10949v1-abstract-short" style="display: inline;"> Speech Emotion Recognition (SER) affective technology enables the intelligent embedded devices to interact with sensitivity. Similarly, call centre employees recognise customers&#39; emotions from their pitch, energy, and tone of voice so as to modify their speech for a high-quality interaction with customers. This work explores, for the first time, the effects of the harmonic and percussive component&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.10949v1-abstract-full').style.display = 'inline'; document.getElementById('2312.10949v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2312.10949v1-abstract-full" style="display: none;"> Speech Emotion Recognition (SER) affective technology enables the intelligent embedded devices to interact with sensitivity. Similarly, call centre employees recognise customers&#39; emotions from their pitch, energy, and tone of voice so as to modify their speech for a high-quality interaction with customers. This work explores, for the first time, the effects of the harmonic and percussive components of Mel spectrograms in SER. We attempt to leverage the Mel spectrogram by decomposing distinguishable acoustic features for exploitation in our proposed architecture, which includes a novel feature map generator algorithm, a CNN-based network feature extractor and a multi-layer perceptron (MLP) classifier. This study specifically focuses on effective data augmentation techniques for building an enriched hybrid-based feature map. This process results in a function that outputs a 2D image so that it can be used as input data for a pre-trained CNN-VGG16 feature extractor. Furthermore, we also investigate other acoustic features such as MFCCs, chromagram, spectral contrast, and the tonnetz to assess our proposed framework. A test accuracy of 92.79% on the Berlin EMO-DB database is achieved. Our result is higher than previous works using CNN-VGG16. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.10949v1-abstract-full').style.display = 'none'; document.getElementById('2312.10949v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">12 pages</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Advances in Knowledge Discovery and Data Mining. PAKDD 2022. Lecture Notes in Computer Science(), vol 13281. Springer, Cham </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2312.10937">arXiv:2312.10937</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2312.10937">pdf</a>, <a href="https://arxiv.org/format/2312.10937">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1007/978-3-031-33380-4_17">10.1007/978-3-031-33380-4_17 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> An Extended Variational Mode Decomposition Algorithm Developed Speech Emotion Recognition Performance </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Rudd%2C+D+H">David Hason Rudd</a>, <a href="/search/eess?searchtype=author&amp;query=Huo%2C+H">Huan Huo</a>, <a href="/search/eess?searchtype=author&amp;query=Xu%2C+G">Guandong Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2312.10937v1-abstract-short" style="display: inline;"> Emotion recognition (ER) from speech signals is a robust approach since it cannot be imitated like facial expression or text based sentiment analysis. Valuable information underlying the emotions are significant for human-computer interactions enabling intelligent machines to interact with sensitivity in the real world. Previous ER studies through speech signal processing have focused exclusively&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.10937v1-abstract-full').style.display = 'inline'; document.getElementById('2312.10937v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2312.10937v1-abstract-full" style="display: none;"> Emotion recognition (ER) from speech signals is a robust approach since it cannot be imitated like facial expression or text based sentiment analysis. Valuable information underlying the emotions are significant for human-computer interactions enabling intelligent machines to interact with sensitivity in the real world. Previous ER studies through speech signal processing have focused exclusively on associations between different signal mode decomposition methods and hidden informative features. However, improper decomposition parameter selections lead to informative signal component losses due to mode duplicating and mixing. In contrast, the current study proposes VGG-optiVMD, an empowered variational mode decomposition algorithm, to distinguish meaningful speech features and automatically select the number of decomposed modes and optimum balancing parameter for the data fidelity constraint by assessing their effects on the VGG16 flattening output layer. Various feature vectors were employed to train the VGG16 network on different databases and assess VGG-optiVMD reproducibility and reliability. One, two, and three-dimensional feature vectors were constructed by concatenating Mel-frequency cepstral coefficients, Chromagram, Mel spectrograms, Tonnetz diagrams, and spectral centroids. Results confirmed a synergistic relationship between the fine-tuning of the signal sample rate and decomposition parameters with classification accuracy, achieving state-of-the-art 96.09% accuracy in predicting seven emotions on the Berlin EMO-DB database. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.10937v1-abstract-full').style.display = 'none'; document.getElementById('2312.10937v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">12 pages</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Advances in Knowledge Discovery and Data Mining. PAKDD 2023. Lecture Notes in Computer Science(), vol 13937. Springer, Cham </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2310.15831">arXiv:2310.15831</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2310.15831">pdf</a>, <a href="https://arxiv.org/format/2310.15831">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> A Comparative Study of Variational Autoencoders, Normalizing Flows, and Score-based Diffusion Models for Electrical Impedance Tomography </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Wang%2C+H">Huihui Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Xu%2C+G">Guixian Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Zhou%2C+Q">Qingping Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2310.15831v3-abstract-short" style="display: inline;"> Electrical Impedance Tomography (EIT) is a widely employed imaging technique in industrial inspection, geophysical prospecting, and medical imaging. However, the inherent nonlinearity and ill-posedness of EIT image reconstruction present challenges for classical regularization techniques, such as the critical selection of regularization terms and the lack of prior knowledge. Deep generative models&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.15831v3-abstract-full').style.display = 'inline'; document.getElementById('2310.15831v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2310.15831v3-abstract-full" style="display: none;"> Electrical Impedance Tomography (EIT) is a widely employed imaging technique in industrial inspection, geophysical prospecting, and medical imaging. However, the inherent nonlinearity and ill-posedness of EIT image reconstruction present challenges for classical regularization techniques, such as the critical selection of regularization terms and the lack of prior knowledge. Deep generative models (DGMs) have been shown to play a crucial role in learning implicit regularizers and prior knowledge. This study aims to investigate the potential of three DGMs-variational autoencoder networks, normalizing flow, and score-based diffusion model-to learn implicit regularizers in learning-based EIT imaging. We first introduce background information on EIT imaging and its inverse problem formulation. Next, we propose three algorithms for performing EIT inverse problems based on corresponding DGMs. Finally, we present numerical and visual experiments, which reveal that (1) no single method consistently outperforms the others across all settings, and (2) when reconstructing an object with 2 anomalies using a well-trained model based on a training dataset containing 4 anomalies, the conditional normalizing flow model (CNF) exhibits the best generalization in low-level noise, while the conditional score-based diffusion model (CSD*) demonstrates the best generalization in high-level noise settings. We hope our preliminary efforts will encourage other researchers to assess their DGMs in EIT and other nonlinear inverse problems. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.15831v3-abstract-full').style.display = 'none'; document.getElementById('2310.15831v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 24 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2307.07829">arXiv:2307.07829</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2307.07829">pdf</a>, <a href="https://arxiv.org/format/2307.07829">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> HQG-Net: Unpaired Medical Image Enhancement with High-Quality Guidance </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=He%2C+C">Chunming He</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+K">Kai Li</a>, <a href="/search/eess?searchtype=author&amp;query=Xu%2C+G">Guoxia Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Yan%2C+J">Jiangpeng Yan</a>, <a href="/search/eess?searchtype=author&amp;query=Tang%2C+L">Longxiang Tang</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+Y">Yulun Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+X">Xiu Li</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+Y">Yaowei Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2307.07829v1-abstract-short" style="display: inline;"> Unpaired Medical Image Enhancement (UMIE) aims to transform a low-quality (LQ) medical image into a high-quality (HQ) one without relying on paired images for training. While most existing approaches are based on Pix2Pix/CycleGAN and are effective to some extent, they fail to explicitly use HQ information to guide the enhancement process, which can lead to undesired artifacts and structural distor&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2307.07829v1-abstract-full').style.display = 'inline'; document.getElementById('2307.07829v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2307.07829v1-abstract-full" style="display: none;"> Unpaired Medical Image Enhancement (UMIE) aims to transform a low-quality (LQ) medical image into a high-quality (HQ) one without relying on paired images for training. While most existing approaches are based on Pix2Pix/CycleGAN and are effective to some extent, they fail to explicitly use HQ information to guide the enhancement process, which can lead to undesired artifacts and structural distortions. In this paper, we propose a novel UMIE approach that avoids the above limitation of existing methods by directly encoding HQ cues into the LQ enhancement process in a variational fashion and thus model the UMIE task under the joint distribution between the LQ and HQ domains. Specifically, we extract features from an HQ image and explicitly insert the features, which are expected to encode HQ cues, into the enhancement network to guide the LQ enhancement with the variational normalization module. We train the enhancement network adversarially with a discriminator to ensure the generated HQ image falls into the HQ domain. We further propose a content-aware loss to guide the enhancement process with wavelet-based pixel-level and multi-encoder-based feature-level constraints. Additionally, as a key motivation for performing image enhancement is to make the enhanced images serve better for downstream tasks, we propose a bi-level learning scheme to optimize the UMIE task and downstream tasks cooperatively, helping generate HQ images both visually appealing and favorable for downstream tasks. Experiments on three medical datasets, including two newly collected datasets, verify that the proposed method outperforms existing techniques in terms of both enhancement quality and downstream task performance. We will make the code and the newly collected datasets publicly available for community study. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2307.07829v1-abstract-full').style.display = 'none'; document.getElementById('2307.07829v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 July, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">14 pages, 10 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2208.14784">arXiv:2208.14784</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2208.14784">pdf</a>, <a href="https://arxiv.org/format/2208.14784">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Optimization and Control">math.OC</span> </div> </div> <p class="title is-5 mathjax"> Practical Operator Sketching Framework for Accelerating Iterative Data-Driven Solutions in Inverse Problems </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Tang%2C+J">Junqi Tang</a>, <a href="/search/eess?searchtype=author&amp;query=Xu%2C+G">Guixian Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Mukherjee%2C+S">Subhadip Mukherjee</a>, <a href="/search/eess?searchtype=author&amp;query=Sch%C3%B6nlieb%2C+C">Carola-Bibiane Sch枚nlieb</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2208.14784v2-abstract-short" style="display: inline;"> We propose a new operator-sketching paradigm for designing efficient iterative data-driven reconstruction (IDR) schemes, e.g. Plug-and-Play algorithms and deep unrolling networks. These IDR schemes are currently the state-of-the-art solutions for imaging inverse problems. However, for high-dimensional imaging tasks, especially X-ray CT and MRI imaging, these IDR schemes typically become inefficien&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2208.14784v2-abstract-full').style.display = 'inline'; document.getElementById('2208.14784v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2208.14784v2-abstract-full" style="display: none;"> We propose a new operator-sketching paradigm for designing efficient iterative data-driven reconstruction (IDR) schemes, e.g. Plug-and-Play algorithms and deep unrolling networks. These IDR schemes are currently the state-of-the-art solutions for imaging inverse problems. However, for high-dimensional imaging tasks, especially X-ray CT and MRI imaging, these IDR schemes typically become inefficient both in terms of computation, due to the need of computing multiple times the high-dimensional forward and adjoint operators. In this work, we explore and propose a universal dimensionality reduction framework for accelerating IDR schemes in solving imaging inverse problems, based on leveraging the sketching techniques from stochastic optimization. Using this framework, we derive a number of accelerated IDR schemes, such as the plug-and-play multi-stage sketched gradient (PnP-MS2G) and sketching-based primal-dual (LSPD and Sk-LSPD) deep unrolling networks. Meanwhile, for fully accelerating PnP schemes when the denoisers are computationally expensive, we provide novel stochastic lazy denoising schemes (Lazy-PnP and Lazy-PnP-EQ), leveraging the ProxSkip scheme in optimization and equivariant image denoisers, which can massively accelerate the PnP algorithms with improved practicality. We provide theoretical analysis for recovery guarantees of instances of the proposed framework. Our numerical experiments on natural image processing and tomographic image reconstruction demonstrate the remarkable effectiveness of our sketched IDR schemes. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2208.14784v2-abstract-full').style.display = 'none'; document.getElementById('2208.14784v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 31 August, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2022. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2208.14635">arXiv:2208.14635</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2208.14635">pdf</a>, <a href="https://arxiv.org/format/2208.14635">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Segmentation-guided Domain Adaptation and Data Harmonization of Multi-device Retinal Optical Coherence Tomography using Cycle-Consistent Generative Adversarial Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Chen%2C+S">Shuo Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Ma%2C+D">Da Ma</a>, <a href="/search/eess?searchtype=author&amp;query=Lee%2C+S">Sieun Lee</a>, <a href="/search/eess?searchtype=author&amp;query=Yu%2C+T+T+L">Timothy T. L. Yu</a>, <a href="/search/eess?searchtype=author&amp;query=Xu%2C+G">Gavin Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Lu%2C+D">Donghuan Lu</a>, <a href="/search/eess?searchtype=author&amp;query=Popuri%2C+K">Karteek Popuri</a>, <a href="/search/eess?searchtype=author&amp;query=Ju%2C+M+J">Myeong Jin Ju</a>, <a href="/search/eess?searchtype=author&amp;query=Sarunic%2C+M+V">Marinko V. Sarunic</a>, <a href="/search/eess?searchtype=author&amp;query=Beg%2C+M+F">Mirza Faisal Beg</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2208.14635v1-abstract-short" style="display: inline;"> Optical Coherence Tomography(OCT) is a non-invasive technique capturing cross-sectional area of the retina in micro-meter resolutions. It has been widely used as a auxiliary imaging reference to detect eye-related pathology and predict longitudinal progression of the disease characteristics. Retina layer segmentation is one of the crucial feature extraction techniques, where the variations of reti&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2208.14635v1-abstract-full').style.display = 'inline'; document.getElementById('2208.14635v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2208.14635v1-abstract-full" style="display: none;"> Optical Coherence Tomography(OCT) is a non-invasive technique capturing cross-sectional area of the retina in micro-meter resolutions. It has been widely used as a auxiliary imaging reference to detect eye-related pathology and predict longitudinal progression of the disease characteristics. Retina layer segmentation is one of the crucial feature extraction techniques, where the variations of retinal layer thicknesses and the retinal layer deformation due to the presence of the fluid are highly correlated with multiple epidemic eye diseases like Diabetic Retinopathy(DR) and Age-related Macular Degeneration (AMD). However, these images are acquired from different devices, which have different intensity distribution, or in other words, belong to different imaging domains. This paper proposes a segmentation-guided domain-adaptation method to adapt images from multiple devices into single image domain, where the state-of-art pre-trained segmentation model is available. It avoids the time consumption of manual labelling for the upcoming new dataset and the re-training of the existing network. The semantic consistency and global feature consistency of the network will minimize the hallucination effect that many researchers reported regarding Cycle-Consistent Generative Adversarial Networks(CycleGAN) architecture. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2208.14635v1-abstract-full').style.display = 'none'; document.getElementById('2208.14635v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 August, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">16 pages, 10 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2205.14833">arXiv:2205.14833</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2205.14833">pdf</a>, <a href="https://arxiv.org/format/2205.14833">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Walle: An End-to-End, General-Purpose, and Large-Scale Production System for Device-Cloud Collaborative Machine Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Lv%2C+C">Chengfei Lv</a>, <a href="/search/eess?searchtype=author&amp;query=Niu%2C+C">Chaoyue Niu</a>, <a href="/search/eess?searchtype=author&amp;query=Gu%2C+R">Renjie Gu</a>, <a href="/search/eess?searchtype=author&amp;query=Jiang%2C+X">Xiaotang Jiang</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+Z">Zhaode Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+B">Bin Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Wu%2C+Z">Ziqi Wu</a>, <a href="/search/eess?searchtype=author&amp;query=Yao%2C+Q">Qiulin Yao</a>, <a href="/search/eess?searchtype=author&amp;query=Huang%2C+C">Congyu Huang</a>, <a href="/search/eess?searchtype=author&amp;query=Huang%2C+P">Panos Huang</a>, <a href="/search/eess?searchtype=author&amp;query=Huang%2C+T">Tao Huang</a>, <a href="/search/eess?searchtype=author&amp;query=Shu%2C+H">Hui Shu</a>, <a href="/search/eess?searchtype=author&amp;query=Song%2C+J">Jinde Song</a>, <a href="/search/eess?searchtype=author&amp;query=Zou%2C+B">Bin Zou</a>, <a href="/search/eess?searchtype=author&amp;query=Lan%2C+P">Peng Lan</a>, <a href="/search/eess?searchtype=author&amp;query=Xu%2C+G">Guohuan Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Wu%2C+F">Fei Wu</a>, <a href="/search/eess?searchtype=author&amp;query=Tang%2C+S">Shaojie Tang</a>, <a href="/search/eess?searchtype=author&amp;query=Wu%2C+F">Fan Wu</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+G">Guihai Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2205.14833v1-abstract-short" style="display: inline;"> To break the bottlenecks of mainstream cloud-based machine learning (ML) paradigm, we adopt device-cloud collaborative ML and build the first end-to-end and general-purpose system, called Walle, as the foundation. Walle consists of a deployment platform, distributing ML tasks to billion-scale devices in time; a data pipeline, efficiently preparing task input; and a compute container, providing a c&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2205.14833v1-abstract-full').style.display = 'inline'; document.getElementById('2205.14833v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2205.14833v1-abstract-full" style="display: none;"> To break the bottlenecks of mainstream cloud-based machine learning (ML) paradigm, we adopt device-cloud collaborative ML and build the first end-to-end and general-purpose system, called Walle, as the foundation. Walle consists of a deployment platform, distributing ML tasks to billion-scale devices in time; a data pipeline, efficiently preparing task input; and a compute container, providing a cross-platform and high-performance execution environment, while facilitating daily task iteration. Specifically, the compute container is based on Mobile Neural Network (MNN), a tensor compute engine along with the data processing and model execution libraries, which are exposed through a refined Python thread-level virtual machine (VM) to support diverse ML tasks and concurrent task execution. The core of MNN is the novel mechanisms of operator decomposition and semi-auto search, sharply reducing the workload in manually optimizing hundreds of operators for tens of hardware backends and further quickly identifying the best backend with runtime optimization for a computation graph. The data pipeline introduces an on-device stream processing framework to enable processing user behavior data at source. The deployment platform releases ML tasks with an efficient push-then-pull method and supports multi-granularity deployment policies. We evaluate Walle in practical e-commerce application scenarios to demonstrate its effectiveness, efficiency, and scalability. Extensive micro-benchmarks also highlight the superior performance of MNN and the Python thread-level VM. Walle has been in large-scale production use in Alibaba, while MNN has been open source with a broad impact in the community. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2205.14833v1-abstract-full').style.display = 'none'; document.getElementById('2205.14833v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 May, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by OSDI 2022</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2202.09515">arXiv:2202.09515</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2202.09515">pdf</a>, <a href="https://arxiv.org/format/2202.09515">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> SPNet: A novel deep neural network for retinal vessel segmentation based on shared decoder and pyramid-like loss </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Xu%2C+G">Geng-Xin Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Ren%2C+C">Chuan-Xian Ren</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2202.09515v1-abstract-short" style="display: inline;"> Segmentation of retinal vessel images is critical to the diagnosis of retinopathy. Recently, convolutional neural networks have shown significant ability to extract the blood vessel structure. However, it remains challenging to refined segmentation for the capillaries and the edges of retinal vessels due to thickness inconsistencies and blurry boundaries. In this paper, we propose a novel deep neu&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2202.09515v1-abstract-full').style.display = 'inline'; document.getElementById('2202.09515v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2202.09515v1-abstract-full" style="display: none;"> Segmentation of retinal vessel images is critical to the diagnosis of retinopathy. Recently, convolutional neural networks have shown significant ability to extract the blood vessel structure. However, it remains challenging to refined segmentation for the capillaries and the edges of retinal vessels due to thickness inconsistencies and blurry boundaries. In this paper, we propose a novel deep neural network for retinal vessel segmentation based on shared decoder and pyramid-like loss (SPNet) to address the above problems. Specifically, we introduce a decoder-sharing mechanism to capture multi-scale semantic information, where feature maps at diverse scales are decoded through a sequence of weight-sharing decoder modules. Also, to strengthen characterization on the capillaries and the edges of blood vessels, we define a residual pyramid architecture which decomposes the spatial information in the decoding phase. A pyramid-like loss function is designed to compensate possible segmentation errors progressively. Experimental results on public benchmarks show that the proposed method outperforms the backbone network and the state-of-the-art methods, especially in the regions of the capillaries and the vessel contours. In addition, performances on cross-datasets verify that SPNet shows stronger generalization ability. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2202.09515v1-abstract-full').style.display = 'none'; document.getElementById('2202.09515v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 February, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2022. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2112.01770">arXiv:2112.01770</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2112.01770">pdf</a>, <a href="https://arxiv.org/format/2112.01770">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> THz Band Channel Measurements and Statistical Modeling for Urban Microcellular Environments </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Abbasi%2C+N+A">Naveed A. Abbasi</a>, <a href="/search/eess?searchtype=author&amp;query=Gomez-Ponce%2C+J">Jorge Gomez-Ponce</a>, <a href="/search/eess?searchtype=author&amp;query=Kondaveti%2C+R">Revanth Kondaveti</a>, <a href="/search/eess?searchtype=author&amp;query=Kumar%2C+A">Ashish Kumar</a>, <a href="/search/eess?searchtype=author&amp;query=Bhagat%2C+E">Eshan Bhagat</a>, <a href="/search/eess?searchtype=author&amp;query=Rao%2C+R+N+S">Rakesh N S Rao</a>, <a href="/search/eess?searchtype=author&amp;query=Abu-Surra%2C+S">Shadi Abu-Surra</a>, <a href="/search/eess?searchtype=author&amp;query=Xu%2C+G">Gary Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+C">Charlie Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Molisch%2C+A+F">Andreas F. Molisch</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2112.01770v1-abstract-short" style="display: inline;"> The THz band (0.1-10 THz) has attracted considerable attention for next-generation wireless communications, due to the large amount of available bandwidth that may be key to meet the rapidly increasing data rate requirements. Before deploying a system in this band, a detailed wireless channel analysis is required as the basis for proper design and testing of system implementations. One of the most&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2112.01770v1-abstract-full').style.display = 'inline'; document.getElementById('2112.01770v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2112.01770v1-abstract-full" style="display: none;"> The THz band (0.1-10 THz) has attracted considerable attention for next-generation wireless communications, due to the large amount of available bandwidth that may be key to meet the rapidly increasing data rate requirements. Before deploying a system in this band, a detailed wireless channel analysis is required as the basis for proper design and testing of system implementations. One of the most important deployment scenarios of this band is the outdoor microcellular environment, where the Transmitter (Tx) and the Receiver (Rx) have a significant height difference (typically $ \ge 10$ m). In this paper, we present double-directional (i.e., directionally resolved at both link ends) channel measurements in such a microcellular scenario encompassing street canyons and an open square. Measurements are done for a 1 GHz bandwidth between 145-146 GHz and an antenna beamwidth of 13 degree; distances between Tx and Rx are up to 85 m and the Tx is at a height of 11.5 m from the ground. The measurements are analyzed to estimate path loss, shadowing, delay spread, angular spread, and multipath component (MPC) power distribution. These results allow the development of more realistic and detailed THz channel models and system performance assessment. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2112.01770v1-abstract-full').style.display = 'none'; document.getElementById('2112.01770v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 December, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2021. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2111.10372">arXiv:2111.10372</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2111.10372">pdf</a>, <a href="https://arxiv.org/format/2111.10372">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Resistance-Time Co-Modulated PointNet for Temporal Super-Resolution Simulation of Blood Vessel Flows </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Jiang%2C+Z">Zhizheng Jiang</a>, <a href="/search/eess?searchtype=author&amp;query=Gao%2C+F">Fei Gao</a>, <a href="/search/eess?searchtype=author&amp;query=Gu%2C+R">Renshu Gu</a>, <a href="/search/eess?searchtype=author&amp;query=Xu%2C+J">Jinlan Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Xu%2C+G">Gang Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Rabczuk%2C+T">Timon Rabczuk</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2111.10372v1-abstract-short" style="display: inline;"> In this paper, a novel deep learning framework is proposed for temporal super-resolution simulation of blood vessel flows, in which a high-temporal-resolution time-varying blood vessel flow simulation is generated from a low-temporal-resolution flow simulation result. In our framework, point-cloud is used to represent the complex blood vessel model, resistance-time aided PointNet model is proposed&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2111.10372v1-abstract-full').style.display = 'inline'; document.getElementById('2111.10372v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2111.10372v1-abstract-full" style="display: none;"> In this paper, a novel deep learning framework is proposed for temporal super-resolution simulation of blood vessel flows, in which a high-temporal-resolution time-varying blood vessel flow simulation is generated from a low-temporal-resolution flow simulation result. In our framework, point-cloud is used to represent the complex blood vessel model, resistance-time aided PointNet model is proposed for extracting the time-space features of the time-varying flow field, and finally we can reconstruct the high-accuracy and high-resolution flow field through the Decoder module. In particular, the amplitude loss and the orientation loss of the velocity are proposed from the vector characteristics of the velocity. And the combination of these two metrics constitutes the final loss function for network training. Several examples are given to illustrate the effective and efficiency of the proposed framework for temporal super-resolution simulation of blood vessel flows. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2111.10372v1-abstract-full').style.display = 'none'; document.getElementById('2111.10372v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 November, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2021. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2110.13000">arXiv:2110.13000</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2110.13000">pdf</a>, <a href="https://arxiv.org/format/2110.13000">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Optics">physics.optics</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/APS/URSI47566.2021.9704271">10.1109/APS/URSI47566.2021.9704271 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Extreme Beam-forming with Metagrating-assisted Planar Antennas </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Xu%2C+G">Gengyu Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Hum%2C+S+V">Sean V. Hum</a>, <a href="/search/eess?searchtype=author&amp;query=Eleftheriades%2C+G+V">George V. Eleftheriades</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2110.13000v2-abstract-short" style="display: inline;"> We present a highly efficient metagrating-assisted antenna (MGA) architecture with a simple integrated feed. Power from a localized line source is distributed throughout the arbitrarily large antenna aperture with the help of a passive and lossless electromagnetic metagrating (MG). With appropriately designed meta-wire loading, the omnidirectional source field can be efficiently transformed into d&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2110.13000v2-abstract-full').style.display = 'inline'; document.getElementById('2110.13000v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2110.13000v2-abstract-full" style="display: none;"> We present a highly efficient metagrating-assisted antenna (MGA) architecture with a simple integrated feed. Power from a localized line source is distributed throughout the arbitrarily large antenna aperture with the help of a passive and lossless electromagnetic metagrating (MG). With appropriately designed meta-wire loading, the omnidirectional source field can be efficiently transformed into directive radiation. To aid the design process, a 2-dimensional volume-surface integral equation framework which accurately predicts the radiation pattern of the MGA is developed. Through constrained optimization, the directivity of the MGA in the desired direction is maximized. In this way, extreme-angle beam steering is demonstrated. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2110.13000v2-abstract-full').style.display = 'none'; document.getElementById('2110.13000v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 December, 2021; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 5 October, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2021. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2109.13693">arXiv:2109.13693</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2109.13693">pdf</a>, <a href="https://arxiv.org/format/2109.13693">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> THz Band Channel Measurements and Statistical Modeling for Urban D2D Environments </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Abbasi%2C+N+A">Naveed A. Abbasi</a>, <a href="/search/eess?searchtype=author&amp;query=Gomez-Ponce%2C+J">Jorge Gomez-Ponce</a>, <a href="/search/eess?searchtype=author&amp;query=Kondaveti%2C+R">Revanth Kondaveti</a>, <a href="/search/eess?searchtype=author&amp;query=Shaikbepari%2C+S+M">Shahid M. Shaikbepari</a>, <a href="/search/eess?searchtype=author&amp;query=Rao%2C+S">Shreyas Rao</a>, <a href="/search/eess?searchtype=author&amp;query=Abu-Surra%2C+S">Shadi Abu-Surra</a>, <a href="/search/eess?searchtype=author&amp;query=Xu%2C+G">Gary Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+C">Charlie Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Molisch%2C+A+F">Andreas F. Molisch</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2109.13693v1-abstract-short" style="display: inline;"> THz band is envisioned to be used in 6G systems to meet the ever-increasing demand for data rate. However, before an eventual system design and deployment can proceed, detailed channel sounding measurements are required to understand key channel characteristics. In this paper, we present a first extensive set of channel measurements for urban outdoor environments that are ultra-wideband (1 GHz 3dB&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2109.13693v1-abstract-full').style.display = 'inline'; document.getElementById('2109.13693v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2109.13693v1-abstract-full" style="display: none;"> THz band is envisioned to be used in 6G systems to meet the ever-increasing demand for data rate. However, before an eventual system design and deployment can proceed, detailed channel sounding measurements are required to understand key channel characteristics. In this paper, we present a first extensive set of channel measurements for urban outdoor environments that are ultra-wideband (1 GHz 3dB bandwidth), and double-directional where both the transmitter and receiver are at the same height. In all, we present measurements at 38 Tx/Rx location pairs, consisting of a total of nearly 50,000 impulse responses, at both line-of-sight (LoS) and non-line-of-sight (NLoS) cases in the 1-100 m range. We provide modeling for path loss, shadowing, delay spread, angular spread and multipath component (MPC) power distribution. We find, among other things, that outdoor communication over tens of meters is feasible in this frequency range even in NLoS scenarios, that omni-directional delay spreads of up to 100 ns, and directional delay spreads of up to 10 ns are observed, while angular spreads are also quite significant, and a surprisingly large number of MPCs are observed for 1 GHz bandwidth and 13 degree beamwidth. These results constitute an important first step towards better understanding the wireless channel in the THz band. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2109.13693v1-abstract-full').style.display = 'none'; document.getElementById('2109.13693v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 September, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2021. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2109.03478">arXiv:2109.03478</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2109.03478">pdf</a>, <a href="https://arxiv.org/format/2109.03478">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/TMI.2021.3104474">10.1109/TMI.2021.3104474 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Cross-Site Severity Assessment of COVID-19 from CT Images via Domain Adaptation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Xu%2C+G">Geng-Xin Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+C">Chen Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+J">Jun Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Ding%2C+Z">Zhongxiang Ding</a>, <a href="/search/eess?searchtype=author&amp;query=Shi%2C+F">Feng Shi</a>, <a href="/search/eess?searchtype=author&amp;query=Guo%2C+M">Man Guo</a>, <a href="/search/eess?searchtype=author&amp;query=Zhao%2C+W">Wei Zhao</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+X">Xiaoming Li</a>, <a href="/search/eess?searchtype=author&amp;query=Wei%2C+Y">Ying Wei</a>, <a href="/search/eess?searchtype=author&amp;query=Gao%2C+Y">Yaozong Gao</a>, <a href="/search/eess?searchtype=author&amp;query=Ren%2C+C">Chuan-Xian Ren</a>, <a href="/search/eess?searchtype=author&amp;query=Shen%2C+D">Dinggang Shen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2109.03478v1-abstract-short" style="display: inline;"> Early and accurate severity assessment of Coronavirus disease 2019 (COVID-19) based on computed tomography (CT) images offers a great help to the estimation of intensive care unit event and the clinical decision of treatment planning. To augment the labeled data and improve the generalization ability of the classification model, it is necessary to aggregate data from multiple sites. This task face&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2109.03478v1-abstract-full').style.display = 'inline'; document.getElementById('2109.03478v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2109.03478v1-abstract-full" style="display: none;"> Early and accurate severity assessment of Coronavirus disease 2019 (COVID-19) based on computed tomography (CT) images offers a great help to the estimation of intensive care unit event and the clinical decision of treatment planning. To augment the labeled data and improve the generalization ability of the classification model, it is necessary to aggregate data from multiple sites. This task faces several challenges including class imbalance between mild and severe infections, domain distribution discrepancy between sites, and presence of heterogeneous features. In this paper, we propose a novel domain adaptation (DA) method with two components to address these problems. The first component is a stochastic class-balanced boosting sampling strategy that overcomes the imbalanced learning problem and improves the classification performance on poorly-predicted classes. The second component is a representation learning that guarantees three properties: 1) domain-transferability by prototype triplet loss, 2) discriminant by conditional maximum mean discrepancy loss, and 3) completeness by multi-view reconstruction loss. Particularly, we propose a domain translator and align the heterogeneous data to the estimated class prototypes (i.e., class centers) in a hyper-sphere manifold. Experiments on cross-site severity assessment of COVID-19 from CT images show that the proposed method can effectively tackle the imbalanced learning problem and outperform recent DA approaches. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2109.03478v1-abstract-full').style.display = 'none'; document.getElementById('2109.03478v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 September, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2021. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2109.01235">arXiv:2109.01235</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2109.01235">pdf</a>, <a href="https://arxiv.org/format/2109.01235">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> DeepTracks: Geopositioning Maritime Vehicles in Video Acquired from a Moving Platform </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Wei%2C+J">Jianli Wei</a>, <a href="/search/eess?searchtype=author&amp;query=Xu%2C+G">Guanyu Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Yilmaz%2C+A">Alper Yilmaz</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2109.01235v1-abstract-short" style="display: inline;"> Geopositioning and tracking a moving boat at sea is a very challenging problem, requiring boat detection, matching and estimating its GPS location from imagery with no common features. The problem can be stated as follows: given imagery from a camera mounted on a moving platform with known GPS location as the only valid sensor, we predict the geoposition of a target boat visible in images. Our sol&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2109.01235v1-abstract-full').style.display = 'inline'; document.getElementById('2109.01235v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2109.01235v1-abstract-full" style="display: none;"> Geopositioning and tracking a moving boat at sea is a very challenging problem, requiring boat detection, matching and estimating its GPS location from imagery with no common features. The problem can be stated as follows: given imagery from a camera mounted on a moving platform with known GPS location as the only valid sensor, we predict the geoposition of a target boat visible in images. Our solution uses recent ML algorithms, the camera-scene geometry and Bayesian filtering. The proposed pipeline first detects and tracks the target boat&#39;s location in the image with the strategy of tracking by detection. This image location is then converted to geoposition to the local sea coordinates referenced to the camera GPS location using plane projective geometry. Finally, target boat local coordinates are transformed to global GPS coordinates to estimate the geoposition. To achieve a smooth geotrajectory, we apply unscented Kalman filter (UKF) which implicitly overcomes small detection errors in the early stages of the pipeline. We tested the performance of our approach using GPS ground truth and show the accuracy and speed of the estimated geopositions. Our code is publicly available at https://github.com/JianliWei1995/AI-Track-at-Sea. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2109.01235v1-abstract-full').style.display = 'none'; document.getElementById('2109.01235v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 September, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2021. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2107.07988">arXiv:2107.07988</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2107.07988">pdf</a>, <a href="https://arxiv.org/format/2107.07988">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Controlled AutoEncoders to Generate Faces from Voices </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Liang%2C+H">Hao Liang</a>, <a href="/search/eess?searchtype=author&amp;query=Yu%2C+L">Lulan Yu</a>, <a href="/search/eess?searchtype=author&amp;query=Xu%2C+G">Guikang Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Raj%2C+B">Bhiksha Raj</a>, <a href="/search/eess?searchtype=author&amp;query=Singh%2C+R">Rita Singh</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2107.07988v1-abstract-short" style="display: inline;"> Multiple studies in the past have shown that there is a strong correlation between human vocal characteristics and facial features. However, existing approaches generate faces simply from voice, without exploring the set of features that contribute to these observed correlations. A computational methodology to explore this can be devised by rephrasing the question to: &#34;how much would a target face&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2107.07988v1-abstract-full').style.display = 'inline'; document.getElementById('2107.07988v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2107.07988v1-abstract-full" style="display: none;"> Multiple studies in the past have shown that there is a strong correlation between human vocal characteristics and facial features. However, existing approaches generate faces simply from voice, without exploring the set of features that contribute to these observed correlations. A computational methodology to explore this can be devised by rephrasing the question to: &#34;how much would a target face have to change in order to be perceived as the originator of a source voice?&#34; With this in perspective, we propose a framework to morph a target face in response to a given voice in a way that facial features are implicitly guided by learned voice-face correlation in this paper. Our framework includes a guided autoencoder that converts one face to another, controlled by a unique model-conditioning component called a gating controller which modifies the reconstructed face based on input voice recordings. We evaluate the framework on VoxCelab and VGGFace datasets through human subjects and face retrieval. Various experiments demonstrate the effectiveness of our proposed model. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2107.07988v1-abstract-full').style.display = 'none'; document.getElementById('2107.07988v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 July, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2021. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2107.02345">arXiv:2107.02345</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2107.02345">pdf</a>, <a href="https://arxiv.org/format/2107.02345">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Domain Adaptation via CycleGAN for Retina Segmentation in Optical Coherence Tomography </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Chen%2C+R">Ricky Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Yu%2C+T+T">Timothy T. Yu</a>, <a href="/search/eess?searchtype=author&amp;query=Xu%2C+G">Gavin Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Ma%2C+D">Da Ma</a>, <a href="/search/eess?searchtype=author&amp;query=Sarunic%2C+M+V">Marinko V. Sarunic</a>, <a href="/search/eess?searchtype=author&amp;query=Beg%2C+M+F">Mirza Faisal Beg</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2107.02345v1-abstract-short" style="display: inline;"> With the FDA approval of Artificial Intelligence (AI) for point-of-care clinical diagnoses, model generalizability is of the utmost importance as clinical decision-making must be domain-agnostic. A method of tackling the problem is to increase the dataset to include images from a multitude of domains; while this technique is ideal, the security requirements of medical data is a major limitation. A&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2107.02345v1-abstract-full').style.display = 'inline'; document.getElementById('2107.02345v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2107.02345v1-abstract-full" style="display: none;"> With the FDA approval of Artificial Intelligence (AI) for point-of-care clinical diagnoses, model generalizability is of the utmost importance as clinical decision-making must be domain-agnostic. A method of tackling the problem is to increase the dataset to include images from a multitude of domains; while this technique is ideal, the security requirements of medical data is a major limitation. Additionally, researchers with developed tools benefit from the addition of open-sourced data, but are limited by the difference in domains. Herewith, we investigated the implementation of a Cycle-Consistent Generative Adversarial Networks (CycleGAN) for the domain adaptation of Optical Coherence Tomography (OCT) volumes. This study was done in collaboration with the Biomedical Optics Research Group and Functional &amp; Anatomical Imaging &amp; Shape Analysis Lab at Simon Fraser University. In this study, we investigated a learning-based approach of adapting the domain of a publicly available dataset, UK Biobank dataset (UKB). To evaluate the performance of domain adaptation, we utilized pre-existing retinal layer segmentation tools developed on a different set of RETOUCH OCT data. This study provides insight on state-of-the-art tools for domain adaptation compared to traditional processing techniques as well as a pipeline for adapting publicly available retinal data to the domains previously used by our collaborators. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2107.02345v1-abstract-full').style.display = 'none'; document.getElementById('2107.02345v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 July, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages, 6 figures, 1 table</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.4.0 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2106.07564">arXiv:2106.07564</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2106.07564">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> An optimized Capsule-LSTM model for facial expression recognition with video sequences </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Liu%2C+S">Siwei Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Long%2C+Y">Yuanpeng Long</a>, <a href="/search/eess?searchtype=author&amp;query=Xu%2C+G">Gao Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Yang%2C+L">Lijia Yang</a>, <a href="/search/eess?searchtype=author&amp;query=Xu%2C+S">Shimei Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Yao%2C+X">Xiaoming Yao</a>, <a href="/search/eess?searchtype=author&amp;query=Shu%2C+K">Kunxian Shu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2106.07564v1-abstract-short" style="display: inline;"> To overcome the limitations of convolutional neural network in the process of facial expression recognition, a facial expression recognition model Capsule-LSTM based on video frame sequence is proposed. This model is composed of three networks includingcapsule encoders, capsule decoders and LSTM network. The capsule encoder extracts the spatial information of facial expressions in video frames. Ca&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2106.07564v1-abstract-full').style.display = 'inline'; document.getElementById('2106.07564v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2106.07564v1-abstract-full" style="display: none;"> To overcome the limitations of convolutional neural network in the process of facial expression recognition, a facial expression recognition model Capsule-LSTM based on video frame sequence is proposed. This model is composed of three networks includingcapsule encoders, capsule decoders and LSTM network. The capsule encoder extracts the spatial information of facial expressions in video frames. Capsule decoder reconstructs the images to optimize the network. LSTM extracts the temporal information between video frames and analyzes the differences in expression changes between frames. The experimental results from the MMI dataset show that the Capsule-LSTM model proposed in this paper can effectively improve the accuracy of video expression recognition. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2106.07564v1-abstract-full').style.display = 'none'; document.getElementById('2106.07564v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 May, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">14pages,4 figurews</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2106.07563">arXiv:2106.07563</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2106.07563">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> BPLF: A Bi-Parallel Linear Flow Model for Facial Expression Generation from Emotion Set Images </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Xu%2C+G">Gao Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Long%2C+Y">Yuanpeng Long</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+S">Siwei Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Yang%2C+L">Lijia Yang</a>, <a href="/search/eess?searchtype=author&amp;query=Xu%2C+S">Shimei Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Yao%2C+X">Xiaoming Yao</a>, <a href="/search/eess?searchtype=author&amp;query=Shu%2C+K">Kunxian Shu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2106.07563v1-abstract-short" style="display: inline;"> The flow-based generative model is a deep learning generative model, which obtains the ability to generate data by explicitly learning the data distribution. Theoretically its ability to restore data is stronger than other generative models. However, its implementation has many limitations, including limited model design, too many model parameters and tedious calculation. In this paper, a bi-paral&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2106.07563v1-abstract-full').style.display = 'inline'; document.getElementById('2106.07563v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2106.07563v1-abstract-full" style="display: none;"> The flow-based generative model is a deep learning generative model, which obtains the ability to generate data by explicitly learning the data distribution. Theoretically its ability to restore data is stronger than other generative models. However, its implementation has many limitations, including limited model design, too many model parameters and tedious calculation. In this paper, a bi-parallel linear flow model for facial emotion generation from emotion set images is constructed, and a series of improvements have been made in terms of the expression ability of the model and the convergence speed in training. The model is mainly composed of several coupling layers superimposed to form a multi-scale structure, in which each coupling layer contains 1*1 reversible convolution and linear operation modules. Furthermore, this paper sorted out the current public data set of facial emotion images, made a new emotion data, and verified the model through this data set. The experimental results show that, under the traditional convolutional neural network, the 3-layer 3*3 convolution kernel is more conducive to extracte the features of the face images. The introduction of principal component decomposition can improve the convergence speed of the model. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2106.07563v1-abstract-full').style.display = 'none'; document.getElementById('2106.07563v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 May, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">20 pages, 10 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2012.03673">arXiv:2012.03673</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2012.03673">pdf</a>, <a href="https://arxiv.org/format/2012.03673">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Efficient Medical Image Segmentation with Intermediate Supervision Mechanism </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Yuan%2C+D">Di Yuan</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+J">Junyang Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Xu%2C+Z">Zhenghua Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Lukasiewicz%2C+T">Thomas Lukasiewicz</a>, <a href="/search/eess?searchtype=author&amp;query=Fu%2C+Z">Zhigang Fu</a>, <a href="/search/eess?searchtype=author&amp;query=Xu%2C+G">Guizhi Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2012.03673v1-abstract-short" style="display: inline;"> Because the expansion path of U-Net may ignore the characteristics of small targets, intermediate supervision mechanism is proposed. The original mask is also entered into the network as a label for intermediate output. However, U-Net is mainly engaged in segmentation, and the extracted features are also targeted at segmentation location information, and the input and output are different. The lab&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2012.03673v1-abstract-full').style.display = 'inline'; document.getElementById('2012.03673v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2012.03673v1-abstract-full" style="display: none;"> Because the expansion path of U-Net may ignore the characteristics of small targets, intermediate supervision mechanism is proposed. The original mask is also entered into the network as a label for intermediate output. However, U-Net is mainly engaged in segmentation, and the extracted features are also targeted at segmentation location information, and the input and output are different. The label we need is that the input and output are both original masks, which is more similar to the refactoring process, so we propose another intermediate supervision mechanism. However, the features extracted by the contraction path of this intermediate monitoring mechanism are not necessarily consistent. For example, U-Net&#39;s contraction path extracts transverse features, while auto-encoder extracts longitudinal features, which may cause the output of the expansion path to be inconsistent with the label. Therefore, we put forward the intermediate supervision mechanism of shared-weight decoder module. Although the intermediate supervision mechanism improves the segmentation accuracy, the training time is too long due to the extra input and multiple loss functions. For one of these problems, we have introduced tied-weight decoder. To reduce the redundancy of the model, we combine shared-weight decoder module with tied-weight decoder module. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2012.03673v1-abstract-full').style.display = 'none'; document.getElementById('2012.03673v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 November, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2020. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2011.08706">arXiv:2011.08706</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2011.08706">pdf</a>, <a href="https://arxiv.org/format/2011.08706">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> FPAENet: Pneumonia Detection Network Based on Feature Pyramid Attention Enhancement </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+X">Xudong Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+B">Bo Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Yuan%2C+D">Di Yuan</a>, <a href="/search/eess?searchtype=author&amp;query=Xu%2C+Z">Zhenghua Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Xu%2C+G">Guizhi Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2011.08706v1-abstract-short" style="display: inline;"> Automatic pneumonia Detection based on deep learning has increasing clinical value. Although the existing Feature Pyramid Network (FPN) and its variants have already achieved some great successes, their detection accuracies for pneumonia lesions in medical images are still unsatisfactory. In this paper, we propose a pneumonia detection network based on feature pyramid attention enhancement, which&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2011.08706v1-abstract-full').style.display = 'inline'; document.getElementById('2011.08706v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2011.08706v1-abstract-full" style="display: none;"> Automatic pneumonia Detection based on deep learning has increasing clinical value. Although the existing Feature Pyramid Network (FPN) and its variants have already achieved some great successes, their detection accuracies for pneumonia lesions in medical images are still unsatisfactory. In this paper, we propose a pneumonia detection network based on feature pyramid attention enhancement, which integrates attended high-level semantic features with low-level information. We add another information extracting path equipped with feature enhancement modules, which are conducted with an attention mechanism. Experimental results show that our proposed method can achieve much better performances, as a higher value of 4.02% and 3.19%, than the baselines in detecting pneumonia lesions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2011.08706v1-abstract-full').style.display = 'none'; document.getElementById('2011.08706v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 November, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2020. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2011.07534">arXiv:2011.07534</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2011.07534">pdf</a>, <a href="https://arxiv.org/format/2011.07534">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> SAG-GAN: Semi-Supervised Attention-Guided GANs for Data Augmentation on Medical Images </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Qi%2C+C">Chang Qi</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+J">Junyang Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Xu%2C+G">Guizhi Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Xu%2C+Z">Zhenghua Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Lukasiewicz%2C+T">Thomas Lukasiewicz</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+Y">Yang Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2011.07534v1-abstract-short" style="display: inline;"> Recently deep learning methods, in particular, convolutional neural networks (CNNs), have led to a massive breakthrough in the range of computer vision. Also, the large-scale annotated dataset is the essential key to a successful training procedure. However, it is a huge challenge to get such datasets in the medical domain. Towards this, we present a data augmentation method for generating synthet&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2011.07534v1-abstract-full').style.display = 'inline'; document.getElementById('2011.07534v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2011.07534v1-abstract-full" style="display: none;"> Recently deep learning methods, in particular, convolutional neural networks (CNNs), have led to a massive breakthrough in the range of computer vision. Also, the large-scale annotated dataset is the essential key to a successful training procedure. However, it is a huge challenge to get such datasets in the medical domain. Towards this, we present a data augmentation method for generating synthetic medical images using cycle-consistency Generative Adversarial Networks (GANs). We add semi-supervised attention modules to generate images with convincing details. We treat tumor images and normal images as two domains. The proposed GANs-based model can generate a tumor image from a normal image, and in turn, it can also generate a normal image from a tumor image. Furthermore, we show that generated medical images can be used for improving the performance of ResNet18 for medical image classification. Our model is applied to three limited datasets of tumor MRI images. We first generate MRI images on limited datasets, then we trained three popular classification models to get the best model for tumor classification. Finally, we train the classification model using real images with classic data augmentation methods and classification models using synthetic images. The classification results between those trained models showed that the proposed SAG-GAN data augmentation method can boost Accuracy and AUC compare with classic data augmentation methods. We believe the proposed data augmentation method can apply to other medical image domains, and improve the accuracy of computer-assisted diagnosis. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2011.07534v1-abstract-full').style.display = 'none'; document.getElementById('2011.07534v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 November, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2020. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2011.05161">arXiv:2011.05161</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2011.05161">pdf</a>, <a href="https://arxiv.org/format/2011.05161">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Improving Prosody Modelling with Cross-Utterance BERT Embeddings for End-to-end Speech Synthesis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Xu%2C+G">Guanghui Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Song%2C+W">Wei Song</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+Z">Zhengchen Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+C">Chao Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=He%2C+X">Xiaodong He</a>, <a href="/search/eess?searchtype=author&amp;query=Zhou%2C+B">Bowen Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2011.05161v1-abstract-short" style="display: inline;"> Despite prosody is related to the linguistic information up to the discourse structure, most text-to-speech (TTS) systems only take into account that within each sentence, which makes it challenging when converting a paragraph of texts into natural and expressive speech. In this paper, we propose to use the text embeddings of the neighboring sentences to improve the prosody generation for each utt&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2011.05161v1-abstract-full').style.display = 'inline'; document.getElementById('2011.05161v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2011.05161v1-abstract-full" style="display: none;"> Despite prosody is related to the linguistic information up to the discourse structure, most text-to-speech (TTS) systems only take into account that within each sentence, which makes it challenging when converting a paragraph of texts into natural and expressive speech. In this paper, we propose to use the text embeddings of the neighboring sentences to improve the prosody generation for each utterance of a paragraph in an end-to-end fashion without using any explicit prosody features. More specifically, cross-utterance (CU) context vectors, which are produced by an additional CU encoder based on the sentence embeddings extracted by a pre-trained BERT model, are used to augment the input of the Tacotron2 decoder. Two types of BERT embeddings are investigated, which leads to the use of different CU encoder structures. Experimental results on a Mandarin audiobook dataset and the LJ-Speech English audiobook dataset demonstrate the use of CU information can improve the naturalness and expressiveness of the synthesized speech. Subjective listening testing shows most of the participants prefer the voice generated using the CU encoder over that generated using standard Tacotron2. It is also found that the prosody can be controlled indirectly by changing the neighbouring sentences. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2011.05161v1-abstract-full').style.display = 'none'; document.getElementById('2011.05161v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 November, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2020. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages, 4 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2011.00940">arXiv:2011.00940</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2011.00940">pdf</a>, <a href="https://arxiv.org/format/2011.00940">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Deep Learning in Computer-Aided Diagnosis and Treatment of Tumors: A Survey </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Zhao%2C+D">Dan Zhao</a>, <a href="/search/eess?searchtype=author&amp;query=Xu%2C+G">Guizhi Xu</a>, <a href="/search/eess?searchtype=author&amp;query=XU%2C+Z">Zhenghua XU</a>, <a href="/search/eess?searchtype=author&amp;query=Lukasiewicz%2C+T">Thomas Lukasiewicz</a>, <a href="/search/eess?searchtype=author&amp;query=Xue%2C+M">Minmin Xue</a>, <a href="/search/eess?searchtype=author&amp;query=Fu%2C+Z">Zhigang Fu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2011.00940v1-abstract-short" style="display: inline;"> Computer-Aided Diagnosis and Treatment of Tumors is a hot topic of deep learning in recent years, which constitutes a series of medical tasks, such as detection of tumor markers, the outline of tumor leisures, subtypes and stages of tumors, prediction of therapeutic effect, and drug development. Meanwhile, there are some deep learning models with precise positioning and excellent performance produ&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2011.00940v1-abstract-full').style.display = 'inline'; document.getElementById('2011.00940v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2011.00940v1-abstract-full" style="display: none;"> Computer-Aided Diagnosis and Treatment of Tumors is a hot topic of deep learning in recent years, which constitutes a series of medical tasks, such as detection of tumor markers, the outline of tumor leisures, subtypes and stages of tumors, prediction of therapeutic effect, and drug development. Meanwhile, there are some deep learning models with precise positioning and excellent performance produced in mainstream task scenarios. Thus follow to introduce deep learning methods from task-orient, mainly focus on the improvements for medical tasks. Then to summarize the recent progress in four stages of tumor diagnosis and treatment, which named In-Vitro Diagnosis (IVD), Imaging Diagnosis (ID), Pathological Diagnosis (PD), and Treatment Planning (TP). According to the specific data types and medical tasks of each stage, we present the applications of deep learning in the Computer-Aided Diagnosis and Treatment of Tumors and analyzing the excellent works therein. This survey concludes by discussing research issues and suggesting challenges for future improvement. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2011.00940v1-abstract-full').style.display = 'none'; document.getElementById('2011.00940v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 November, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2020. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2010.00472">arXiv:2010.00472</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2010.00472">pdf</a>, <a href="https://arxiv.org/format/2010.00472">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/IGARSS.2018.8518855">10.1109/IGARSS.2018.8518855 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> High Quality Remote Sensing Image Super-Resolution Using Deep Memory Connected Network </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Xu%2C+W">Wenjia Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Xu%2C+G">Guangluan Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+Y">Yang Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Sun%2C+X">Xian Sun</a>, <a href="/search/eess?searchtype=author&amp;query=Lin%2C+D">Daoyu Lin</a>, <a href="/search/eess?searchtype=author&amp;query=Wu%2C+Y">Yirong Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2010.00472v1-abstract-short" style="display: inline;"> Single image super-resolution is an effective way to enhance the spatial resolution of remote sensing image, which is crucial for many applications such as target detection and image classification. However, existing methods based on the neural network usually have small receptive fields and ignore the image detail. We propose a novel method named deep memory connected network (DMCN) based on a co&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2010.00472v1-abstract-full').style.display = 'inline'; document.getElementById('2010.00472v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2010.00472v1-abstract-full" style="display: none;"> Single image super-resolution is an effective way to enhance the spatial resolution of remote sensing image, which is crucial for many applications such as target detection and image classification. However, existing methods based on the neural network usually have small receptive fields and ignore the image detail. We propose a novel method named deep memory connected network (DMCN) based on a convolutional neural network to reconstruct high-quality super-resolution images. We build local and global memory connections to combine image detail with environmental information. To further reduce parameters and ease time-consuming, we propose downsampling units, shrinking the spatial size of feature maps. We test DMCN on three remote sensing datasets with different spatial resolution. Experimental results indicate that our method yields promising improvements in both accuracy and visual performance over the current state-of-the-art. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2010.00472v1-abstract-full').style.display = 'none'; document.getElementById('2010.00472v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 October, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2020. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">IGARSS 2018 - 2018 IEEE International Geoscience and Remote Sensing Symposium</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2001.05021">arXiv:2001.05021</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2001.05021">pdf</a>, <a href="https://arxiv.org/format/2001.05021">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Beam-space Multiplexing: Practice, Theory, and Trends-From 4G TD-LTE, 5G, to 6G and Beyond </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Chen%2C+S">Shanzhi Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Sun%2C+S">Shaohui Sun</a>, <a href="/search/eess?searchtype=author&amp;query=Xu%2C+G">Guixian Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Su%2C+X">Xin Su</a>, <a href="/search/eess?searchtype=author&amp;query=Cai%2C+Y">Yuemin Cai</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2001.05021v1-abstract-short" style="display: inline;"> In this article, the new term, namely beam-space multiplexing, is proposed for the former multi-layer beamforming for 4G TD-LTE in 3GPP releases. We provide a systematic overview of beam-space multiplexing from engineering and theoretical perspectives. Firstly, we clarify the fundamental theory of beam-space multiplexing. Specifically, we provide a comprehensive comparison with the antenna-space m&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2001.05021v1-abstract-full').style.display = 'inline'; document.getElementById('2001.05021v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2001.05021v1-abstract-full" style="display: none;"> In this article, the new term, namely beam-space multiplexing, is proposed for the former multi-layer beamforming for 4G TD-LTE in 3GPP releases. We provide a systematic overview of beam-space multiplexing from engineering and theoretical perspectives. Firstly, we clarify the fundamental theory of beam-space multiplexing. Specifically, we provide a comprehensive comparison with the antenna-space multiplexing in terms of theoretical analysis, channel state information acquisition, and engineering implementation constraints. Then, we summarize the key technologies and 3GPP standardization of beam-space multiplexing in 4G TD-LTE and 5G new radio (NR) in terms of multi-layer beamforming and massive beamforming, respectively. We also provide system-level performance evaluation of beam-space multiplexing schemes and field results from current commercial TD-LTE networks and field trial of 5G. The practical deployments of 4G TD-LTE and 5G cellular networks demonstrate the superiority of beam-space multiplexing within the limitations of implementation complexity and practical deployment scenarios. Finally, the future trends of beam-space multiplexing in 6G and beyond are discussed, including massive beamforming for extremely large-scale MIMO (XL-MIMO), low earth orbit (LEO) satellites communication, data-driven intelligent massive beamforming, and multi-target spatial signal processing, i.e., joint communication and sensing, positioning, etc. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2001.05021v1-abstract-full').style.display = 'none'; document.getElementById('2001.05021v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 January, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2020. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">to appear on ieee wireless communications</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2001.00769">arXiv:2001.00769</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2001.00769">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/MGRS.2019.2955120">10.1109/MGRS.2019.2955120 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> InSAR Phase Denoising: A Review of Current Technologies and Future Directions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Xu%2C+G">Gang Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Gao%2C+Y">Yandong Gao</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+J">Jinwei Li</a>, <a href="/search/eess?searchtype=author&amp;query=Xing%2C+M">Mengdao Xing</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2001.00769v2-abstract-short" style="display: inline;"> Nowadays, interferometric synthetic aperture radar (InSAR) has been a powerful tool in remote sensing by enhancing the information acquisition. During the InSAR processing, phase denoising of interferogram is a mandatory step for topography mapping and deformation monitoring. Over the last three decades, a large number of effective algorithms have been developed to do efforts on this topic. In thi&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2001.00769v2-abstract-full').style.display = 'inline'; document.getElementById('2001.00769v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2001.00769v2-abstract-full" style="display: none;"> Nowadays, interferometric synthetic aperture radar (InSAR) has been a powerful tool in remote sensing by enhancing the information acquisition. During the InSAR processing, phase denoising of interferogram is a mandatory step for topography mapping and deformation monitoring. Over the last three decades, a large number of effective algorithms have been developed to do efforts on this topic. In this paper, we give a comprehensive overview of InSAR phase denoising methods, classifying the established and emerging algorithms into four main categories. The first two parts refer to the categories of traditional local filters and transformed-domain filters, respectively. The third part focuses on the category of nonlocal (NL) filters, considering their outstanding performances. Latter, some advanced methods based on new concept of signal processing are also introduced to show their potentials in this field. Moreover, several popular phase denoising methods are illustrated and compared by performing the numerical experiments using both simulated and measured data. The purpose of this paper is intended to provide necessary guideline and inspiration to related researchers by promoting the architecture development of InSAR signal processing. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2001.00769v2-abstract-full').style.display = 'none'; document.getElementById('2001.00769v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 December, 2020; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 3 January, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2020. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">G. Xu, Y. Gao, J. Li and M. Xing, &#34;InSAR Phase Denoising: A Review of Current Technologies and Future Directions,&#34; IEEE Geoscience and Remote Sensing Magazine, vol. 8, no. 2, pp. 64-82, June 2020. DOI 10.1109/MGRS.2019.2955120</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1911.10477">arXiv:1911.10477</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1911.10477">pdf</a>, <a href="https://arxiv.org/format/1911.10477">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/JBHI.2021.3049452">10.1109/JBHI.2021.3049452 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Reinventing 2D Convolutions for 3D Images </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Yang%2C+J">Jiancheng Yang</a>, <a href="/search/eess?searchtype=author&amp;query=Huang%2C+X">Xiaoyang Huang</a>, <a href="/search/eess?searchtype=author&amp;query=He%2C+Y">Yi He</a>, <a href="/search/eess?searchtype=author&amp;query=Xu%2C+J">Jingwei Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Yang%2C+C">Canqian Yang</a>, <a href="/search/eess?searchtype=author&amp;query=Xu%2C+G">Guozheng Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Ni%2C+B">Bingbing Ni</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1911.10477v4-abstract-short" style="display: inline;"> There have been considerable debates over 2D and 3D representation learning on 3D medical images. 2D approaches could benefit from large-scale 2D pretraining, whereas they are generally weak in capturing large 3D contexts. 3D approaches are natively strong in 3D contexts, however few publicly available 3D medical dataset is large and diverse enough for universal 3D pretraining. Even for hybrid (2D&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1911.10477v4-abstract-full').style.display = 'inline'; document.getElementById('1911.10477v4-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1911.10477v4-abstract-full" style="display: none;"> There have been considerable debates over 2D and 3D representation learning on 3D medical images. 2D approaches could benefit from large-scale 2D pretraining, whereas they are generally weak in capturing large 3D contexts. 3D approaches are natively strong in 3D contexts, however few publicly available 3D medical dataset is large and diverse enough for universal 3D pretraining. Even for hybrid (2D + 3D) approaches, the intrinsic disadvantages within the 2D / 3D parts still exist. In this study, we bridge the gap between 2D and 3D convolutions by reinventing the 2D convolutions. We propose ACS (axial-coronal-sagittal) convolutions to perform natively 3D representation learning, while utilizing the pretrained weights on 2D datasets. In ACS convolutions, 2D convolution kernels are split by channel into three parts, and convoluted separately on the three views (axial, coronal and sagittal) of 3D representations. Theoretically, ANY 2D CNN (ResNet, DenseNet, or DeepLab) is able to be converted into a 3D ACS CNN, with pretrained weight of a same parameter size. Extensive experiments on several medical benchmarks (including classification, segmentation and detection tasks) validate the consistent superiority of the pretrained ACS CNNs, over the 2D / 3D CNN counterparts with / without pretraining. Even without pretraining, the ACS convolution can be used as a plug-and-play replacement of standard 3D convolution, with smaller model size and less computation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1911.10477v4-abstract-full').style.display = 'none'; document.getElementById('1911.10477v4-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 January, 2021; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 24 November, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2019. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">IEEE Journal of Biomedical and Health Informatics (IEEE JBHI). Code is available at https://github.com/m3dv/ACSConv</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> IEEE Journal of Biomedical and Health Informatics (IEEE JBHI), 2021 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1909.00700">arXiv:1909.00700</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1909.00700">pdf</a>, <a href="https://arxiv.org/format/1909.00700">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Training-Time-Friendly Network for Real-Time Object Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Liu%2C+Z">Zili Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Zheng%2C+T">Tu Zheng</a>, <a href="/search/eess?searchtype=author&amp;query=Xu%2C+G">Guodong Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Yang%2C+Z">Zheng Yang</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+H">Haifeng Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Cai%2C+D">Deng Cai</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1909.00700v3-abstract-short" style="display: inline;"> Modern object detectors can rarely achieve short training time, fast inference speed, and high accuracy at the same time. To strike a balance among them, we propose the Training-Time-Friendly Network (TTFNet). In this work, we start with light-head, single-stage, and anchor-free designs, which enable fast inference speed. Then, we focus on shortening training time. We notice that encoding more tra&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1909.00700v3-abstract-full').style.display = 'inline'; document.getElementById('1909.00700v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1909.00700v3-abstract-full" style="display: none;"> Modern object detectors can rarely achieve short training time, fast inference speed, and high accuracy at the same time. To strike a balance among them, we propose the Training-Time-Friendly Network (TTFNet). In this work, we start with light-head, single-stage, and anchor-free designs, which enable fast inference speed. Then, we focus on shortening training time. We notice that encoding more training samples from annotated boxes plays a similar role as increasing batch size, which helps enlarge the learning rate and accelerate the training process. To this end, we introduce a novel approach using Gaussian kernels to encode training samples. Besides, we design the initiative sample weights for better information utilization. Experiments on MS COCO show that our TTFNet has great advantages in balancing training time, inference speed, and accuracy. It has reduced training time by more than seven times compared to previous real-time detectors while maintaining state-of-the-art performances. In addition, our super-fast version of TTFNet-18 and TTFNet-53 can outperform SSD300 and YOLOv3 by less than one-tenth of their training time, respectively. The code has been made available at \url{https://github.com/ZJULearning/ttfnet}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1909.00700v3-abstract-full').style.display = 'none'; document.getElementById('1909.00700v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 November, 2019; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 2 September, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2019. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to AAAI2020 (8 pages, 3 figures)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1908.10555">arXiv:1908.10555</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1908.10555">pdf</a>, <a href="https://arxiv.org/format/1908.10555">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> CAMEL: A Weakly Supervised Learning Framework for Histopathology Image Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Xu%2C+G">Gang Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Song%2C+Z">Zhigang Song</a>, <a href="/search/eess?searchtype=author&amp;query=Sun%2C+Z">Zhuo Sun</a>, <a href="/search/eess?searchtype=author&amp;query=Ku%2C+C">Calvin Ku</a>, <a href="/search/eess?searchtype=author&amp;query=Yang%2C+Z">Zhe Yang</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+C">Cancheng Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+S">Shuhao Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Ma%2C+J">Jianpeng Ma</a>, <a href="/search/eess?searchtype=author&amp;query=Xu%2C+W">Wei Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1908.10555v1-abstract-short" style="display: inline;"> Histopathology image analysis plays a critical role in cancer diagnosis and treatment. To automatically segment the cancerous regions, fully supervised segmentation algorithms require labor-intensive and time-consuming labeling at the pixel level. In this research, we propose CAMEL, a weakly supervised learning framework for histopathology image segmentation using only image-level labels. Using mu&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1908.10555v1-abstract-full').style.display = 'inline'; document.getElementById('1908.10555v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1908.10555v1-abstract-full" style="display: none;"> Histopathology image analysis plays a critical role in cancer diagnosis and treatment. To automatically segment the cancerous regions, fully supervised segmentation algorithms require labor-intensive and time-consuming labeling at the pixel level. In this research, we propose CAMEL, a weakly supervised learning framework for histopathology image segmentation using only image-level labels. Using multiple instance learning (MIL)-based label enrichment, CAMEL splits the image into latticed instances and automatically generates instance-level labels. After label enrichment, the instance-level labels are further assigned to the corresponding pixels, producing the approximate pixel-level labels and making fully supervised training of segmentation models possible. CAMEL achieves comparable performance with the fully supervised approaches in both instance-level classification and pixel-level segmentation on CAMELYON16 and a colorectal adenoma dataset. Moreover, the generality of the automatic labeling methodology may benefit future weakly supervised learning studies for histopathology image analysis. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1908.10555v1-abstract-full').style.display = 'none'; document.getElementById('1908.10555v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 August, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2019. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages, 9 figures, accepted by ICCV 2019</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1905.13161">arXiv:1905.13161</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1905.13161">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1088/1741-2552/ab85b2">10.1088/1741-2552/ab85b2 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Simultaneous induction of SSMVEP and SMR Using a Gaiting video stimulus: a novel hybrid brain-computer interface </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+X">Xin Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Xu%2C+G">Guanghua Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Ravi%2C+A">Aravind Ravi</a>, <a href="/search/eess?searchtype=author&amp;query=Pearce%2C+S">Sarah Pearce</a>, <a href="/search/eess?searchtype=author&amp;query=Jiang%2C+N">Ning Jiang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1905.13161v1-abstract-short" style="display: inline;"> We proposed a novel visual stimulus for brain-computer interface. The stimulus is in the form gaiting sequence of a human. The hypothesis is that observing such a visual stimulus would simultaneously induce 1) steady-state motion visual evoked potential (SSMVEP) in the occipital area, similarly to an SSVEP stimulus; and 2) sensorimotor rhythm (SMR) in the primary sensorimotor area, because such ac&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1905.13161v1-abstract-full').style.display = 'inline'; document.getElementById('1905.13161v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1905.13161v1-abstract-full" style="display: none;"> We proposed a novel visual stimulus for brain-computer interface. The stimulus is in the form gaiting sequence of a human. The hypothesis is that observing such a visual stimulus would simultaneously induce 1) steady-state motion visual evoked potential (SSMVEP) in the occipital area, similarly to an SSVEP stimulus; and 2) sensorimotor rhythm (SMR) in the primary sensorimotor area, because such action observation (AO) could activate the mirror neuron system. Canonical correlation analysis (CCA) was used to detect SSMVEP from occipital EEG, and event-related spectral perturbations (ERSP) were used to identify SMR in the EEG from the sensorimotor area. The results showed that the proposed visual gaiting stimulus-induced SSMVEP, with classification accuracies of 88.9 $\pm$ 12.0% in a four-class scenario. More importantly, it induced clear and sustained event-related desynchronization/synchronization (ERD/ERS) in the EEG from the sensorimotor area, while no ERD/ERS in the sensorimotor area could be observed when the other two SSVEP stimuli were used. Further, for participants with sufficiently clear SSMVEP pattern (classification accuracy &gt; 85%), the ERD index values in mu-beta band induced by the proposed gaiting stimulus were statistically different from that of the other two types of stimulus. Therefore, a novel BCI based on the proposed stimulus has potential in neurorehabilitation applications because it simultaneously has the high accuracy of an SSMVEP (~90% accuracy in a four-class setup) and the ability to activate sensorimotor cortex. And such potential will be further explored in future studies. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1905.13161v1-abstract-full').style.display = 'none'; document.getElementById('1905.13161v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 May, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2019. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">22 pages, 7 figures and 2 tables</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Journal of Neural Engineering, Mar. 2020 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1904.06063">arXiv:1904.06063</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1904.06063">pdf</a>, <a href="https://arxiv.org/format/1904.06063">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Building a mixed-lingual neural TTS system with only monolingual data </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Xue%2C+L">Liumeng Xue</a>, <a href="/search/eess?searchtype=author&amp;query=Song%2C+W">Wei Song</a>, <a href="/search/eess?searchtype=author&amp;query=Xu%2C+G">Guanghui Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Xie%2C+L">Lei Xie</a>, <a href="/search/eess?searchtype=author&amp;query=Wu%2C+Z">Zhizheng Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1904.06063v2-abstract-short" style="display: inline;"> When deploying a Chinese neural text-to-speech (TTS) synthesis system, one of the challenges is to synthesize Chinese utterances with English phrases or words embedded. This paper looks into the problem in the encoder-decoder framework when only monolingual data from a target speaker is available. Specifically, we view the problem from two aspects: speaker consistency within an utterance and natur&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1904.06063v2-abstract-full').style.display = 'inline'; document.getElementById('1904.06063v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1904.06063v2-abstract-full" style="display: none;"> When deploying a Chinese neural text-to-speech (TTS) synthesis system, one of the challenges is to synthesize Chinese utterances with English phrases or words embedded. This paper looks into the problem in the encoder-decoder framework when only monolingual data from a target speaker is available. Specifically, we view the problem from two aspects: speaker consistency within an utterance and naturalness. We start the investigation with an Average Voice Model which is built from multi-speaker monolingual data, i.e. Mandarin and English data. On the basis of that, we look into speaker embedding for speaker consistency within an utterance and phoneme embedding for naturalness and intelligibility and study the choice of data for model training. We report the findings and discuss the challenges to build a mixed-lingual TTS system with only monolingual data. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1904.06063v2-abstract-full').style.display = 'none'; document.getElementById('1904.06063v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 August, 2019; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 12 April, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2019. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">To appear in INTERSPEECH 2019</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1812.05329">arXiv:1812.05329</a> <span>&nbsp;&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Wider Channel Attention Network for Remote Sensing Image Super-resolution </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Gu%2C+J">Jun Gu</a>, <a href="/search/eess?searchtype=author&amp;query=Xu%2C+G">Guangluan Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+Y">Yue Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Sun%2C+X">Xian Sun</a>, <a href="/search/eess?searchtype=author&amp;query=Wen%2C+R">Ran Wen</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+L">Lei Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1812.05329v2-abstract-short" style="display: inline;"> Recently, deep convolutional neural networks (CNNs) have obtained promising results in image processing tasks including super-resolution (SR). However, most CNN-based SR methods treat low-resolution (LR) inputs and features equally across channels, rarely notice the loss of information flow caused by the activation function and fail to leverage the representation ability of CNNs. In this letter, w&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1812.05329v2-abstract-full').style.display = 'inline'; document.getElementById('1812.05329v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1812.05329v2-abstract-full" style="display: none;"> Recently, deep convolutional neural networks (CNNs) have obtained promising results in image processing tasks including super-resolution (SR). However, most CNN-based SR methods treat low-resolution (LR) inputs and features equally across channels, rarely notice the loss of information flow caused by the activation function and fail to leverage the representation ability of CNNs. In this letter, we propose a novel single-image super-resolution (SISR) algorithm named Wider Channel Attention Network (WCAN) for remote sensing images. Firstly, the channel attention mechanism is used to adaptively recalibrate the importance of each channel at the middle of the wider attention block (WAB). Secondly, we propose the Local Memory Connection (LMC) to enhance the information flow. Finally, the features within each WAB are fused to take advantage of the network&#39;s representation capability and further improve information and gradient flow. Analytic experiments on a public remote sensing data set (UC Merced) show that our WCAN achieves better accuracy and visual improvements against most state-of-the-art methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1812.05329v2-abstract-full').style.display = 'none'; document.getElementById('1812.05329v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 January, 2019; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 13 December, 2018; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2018. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This work is proposed for remote sensing images, but the idea of the whole paper do not foucs on the characteristics of remote sensing images. The content of the article does not match the title. In this case, we want to do some experiments on the natural images to verify the three tricks in our work</span> </p> </li> </ol> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>

Pages: 1 2 3 4 5 6 7 8 9 10