Search | arXiv e-print repository

<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1&ndash;50 of 112 results for author: <span class="mathjax">Han, Y</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> <div class="content"> <form method="GET" action="/search/eess" aria-role="search"> Searching in archive <strong>eess</strong>. <a href="/search/?searchtype=author&amp;query=Han%2C+Y">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Han, Y"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Han%2C+Y&amp;terms-0-field=author&amp;size=50&amp;order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Han, Y"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Han%2C+Y&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Han%2C+Y&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Han%2C+Y&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Han%2C+Y&amp;start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.03502">arXiv:2502.03502</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.03502">pdf</a>, <a href="https://arxiv.org/format/2502.03502">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> </div> </div> <p class="title is-5 mathjax"> DC-VSR: Spatially and Temporally Consistent Video Super-Resolution with Video Diffusion Prior </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Han%2C+J">Janghyeok Han</a>, <a href="/search/eess?searchtype=author&amp;query=Sim%2C+G">Gyujin Sim</a>, <a href="/search/eess?searchtype=author&amp;query=Kim%2C+G">Geonung Kim</a>, <a href="/search/eess?searchtype=author&amp;query=Lee%2C+H">Hyunseung Lee</a>, <a href="/search/eess?searchtype=author&amp;query=Choi%2C+K">Kyuha Choi</a>, <a href="/search/eess?searchtype=author&amp;query=Han%2C+Y">Youngseok Han</a>, <a href="/search/eess?searchtype=author&amp;query=Cho%2C+S">Sunghyun Cho</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.03502v1-abstract-short" style="display: inline;"> Video super-resolution (VSR) aims to reconstruct a high-resolution (HR) video from a low-resolution (LR) counterpart. Achieving successful VSR requires producing realistic HR details and ensuring both spatial and temporal consistency. To restore realistic details, diffusion-based VSR approaches have recently been proposed. 
2. arXiv:2502.01092 [pdf, other]  cs.RO cs.CV eess.SY
   Enhancing Feature Tracking Reliability for Visual Navigation using Real-Time Safety Filter
   Authors: Dabin Kim, Inkyu Jang, Youngsoo Han, Sunwoo Hwang, H. Jin Kim
   Abstract: Vision sensors are extensively used for localizing a robot's pose, particularly in environments where global localization tools such as GPS or motion capture systems are unavailable. In many visual navigation systems, localization is achieved by detecting and tracking visual features or landmarks, which provide information about the sensor's relative pose. For reliable feature tracking and accurate pose estimation, it is crucial to maintain visibility of a sufficient number of features. This requirement can sometimes conflict with the robot's overall task objective. In this paper, we approach it as a constrained control problem. By leveraging the invariance properties of visibility constraints within the robot's kinematic model, we propose a real-time safety filter based on quadratic programming. This filter takes a reference velocity command as input and produces a modified velocity that minimally deviates from the reference while ensuring the information score from the currently visible features remains above a user-specified threshold. Numerical simulations demonstrate that the proposed safety filter preserves the invariance condition and ensures the visibility of more features than the required minimum. We also validated its real-world performance by integrating it into a visual simultaneous localization and mapping (SLAM) algorithm, where it maintained high estimation quality in challenging environments, outperforming a simple tracking controller.
   Submitted 3 February, 2025; originally announced February 2025.
   Comments: 7 pages, 6 figures. Accepted to 2025 IEEE International Conference on Robotics & Automation (ICRA 2025)
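The minimal-deviation filter described in this abstract is a quadratic program; for a single linearized constraint a @ v >= b it even has a closed form, which the sketch below uses. The constraint vector, threshold, and function names here are placeholders, not the paper's formulation:

```python
# Hedged sketch of a minimal-deviation velocity filter: solve
#   min_v ||v - v_ref||^2  s.t.  a @ v >= b,
# the simplest instance of the QP-based safety filter described above.
import numpy as np

def safety_filter(v_ref, a, b):
    """Return the velocity closest to v_ref satisfying a @ v >= b."""
    margin = a @ v_ref - b
    if margin >= 0.0:            # reference command already safe: pass through
        return v_ref
    # Project onto the halfspace boundary (closed form for one constraint).
    return v_ref - (margin / (a @ a)) * a

v_ref = np.array([1.0, 0.0, -0.5])   # task-level velocity command
a = np.array([0.0, 1.0, 1.0])        # hypothetical linearized visibility normal
b = 0.2                              # required "information score" level
v_safe = safety_filter(v_ref, a, b)
print(v_safe, a @ v_safe)            # constraint is now active: a @ v == b
```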
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">7 pages, 6 figures, Accepted to 2025 IEEE International Conference on Robotics &amp; Automation (ICRA 2025)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.17885">arXiv:2501.17885</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.17885">pdf</a>, <a href="https://arxiv.org/format/2501.17885">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> L-Sort: On-chip Spike Sorting with Efficient Median-of-Median Detection and Localization-based Clustering </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Han%2C+Y">Yuntao Han</a>, <a href="/search/eess?searchtype=author&amp;query=Pan%2C+Y">Yihan Pan</a>, <a href="/search/eess?searchtype=author&amp;query=Jiang%2C+X">Xiongfei Jiang</a>, <a href="/search/eess?searchtype=author&amp;query=Sestito%2C+C">Cristian Sestito</a>, <a href="/search/eess?searchtype=author&amp;query=Agwa%2C+S">Shady Agwa</a>, <a href="/search/eess?searchtype=author&amp;query=Prodromakis%2C+T">Themis Prodromakis</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+S">Shiwei Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.17885v1-abstract-short" style="display: inline;"> Spike sorting is a critical process for decoding large-scale neural activity from extracellular recordings. The advancement of neural probes facilitates the recording of a high number of neurons with an increase in channel counts, arising a higher data volume and challenging the current on-chip spike sorters. This paper introduces L-Sort, a novel on-chip spike sorting solution featuring median-of-&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.17885v1-abstract-full').style.display = 'inline'; document.getElementById('2501.17885v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.17885v1-abstract-full" style="display: none;"> Spike sorting is a critical process for decoding large-scale neural activity from extracellular recordings. The advancement of neural probes facilitates the recording of a high number of neurons with an increase in channel counts, arising a higher data volume and challenging the current on-chip spike sorters. This paper introduces L-Sort, a novel on-chip spike sorting solution featuring median-of-median spike detection and localization-based clustering. By combining the median-of-median approximation and the proposed incremental median calculation scheme, our detection module achieves a reduction in memory consumption. Moreover, the localization-based clustering utilizes geometric features instead of morphological features, thus eliminating the memory-consuming buffer for containing the spike waveform during feature extraction. Evaluation using Neuropixels datasets demonstrates that L-Sort achieves competitive sorting accuracy with reduced hardware resource consumption. 
Implementations on FPGA and ASIC (180 nm technology) demonstrate significant improvements in area and power efficiency compared to state-of-the-art designs while maintaining comparable accuracy. If normalized to 22 nm technology, our design can achieve roughly $\times 10$ area and power efficiency with similar accuracy, compared with the state-of-the-art design evaluated with the same dataset. Therefore, L-Sort is a promising solution for real-time, high-channel-count neural processing in implantable devices. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.17885v1-abstract-full').style.display = 'none'; document.getElementById('2501.17885v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">arXiv admin note: text overlap with arXiv:2406.18425</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> B.7.1 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.15119">arXiv:2501.15119</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.15119">pdf</a>, <a href="https://arxiv.org/format/2501.15119">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Efficient Video Neural Network Processing Based on Motion Estimation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Wang%2C+H">Haichao Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Wen%2C+J">Jiangtao Wen</a>, <a href="/search/eess?searchtype=author&amp;query=Han%2C+Y">Yuxing Han</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.15119v1-abstract-short" style="display: inline;"> Video neural network (VNN) processing using the conventional pipeline first converts Bayer video information into human understandable RGB videos using image signal processing (ISP) on a pixel by pixel basis. Then, VNN processing is performed on a frame by frame basis. Both ISP and VNN are computationally expensive with high power consumption and latency. In this paper, we propose an efficient VNN&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.15119v1-abstract-full').style.display = 'inline'; document.getElementById('2501.15119v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.15119v1-abstract-full" style="display: none;"> Video neural network (VNN) processing using the conventional pipeline first converts Bayer video information into human understandable RGB videos using image signal processing (ISP) on a pixel by pixel basis. 
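The two memory-saving ideas named in this abstract can be illustrated in software, with the caveat that this is a generic constant-memory streaming-median approximation and a generic median-of-medians, not the paper's exact hardware scheme:

```python
# Hedged illustration of (1) an incremental median estimate that nudges
# itself toward each new sample in O(1) memory, and (2) a median-of-median
# approximation over fixed-size groups.
import numpy as np

def incremental_median(samples, step=0.1):
    """Track an approximate running median without storing samples."""
    m = 0.0
    for x in samples:
        if x > m:
            m += step    # estimate drifts up when the sample is above it
        elif x < m:
            m -= step    # and down when below; it settles near the median
    return m

def median_of_medians(samples, group=5):
    """Approximate the median as the median of per-group medians."""
    groups = [samples[i:i + group] for i in range(0, len(samples), group)]
    return np.median([np.median(g) for g in groups])

rng = np.random.default_rng(1)
x = rng.normal(loc=2.0, scale=1.0, size=5000)
print(np.median(x), incremental_median(x), median_of_medians(x))
```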
4. arXiv:2501.15119 [pdf, other]  cs.CV eess.IV
   Efficient Video Neural Network Processing Based on Motion Estimation
   Authors: Haichao Wang, Jiangtao Wen, Yuxing Han
   Abstract: Video neural network (VNN) processing with the conventional pipeline first converts Bayer video information into human-understandable RGB video using image signal processing (ISP) on a pixel-by-pixel basis; VNN processing is then performed on a frame-by-frame basis. Both ISP and VNN are computationally expensive, with high power consumption and latency. In this paper, we propose an efficient VNN processing framework. Instead of using ISP, computer vision tasks are accomplished directly on Bayer-pattern information. To accelerate VNN processing, motion estimation is introduced to find temporal redundancies in the input video data so as to avoid repeated and unnecessary computations. Experiments show greater than 67% computation reduction while maintaining computer vision task accuracy on typical tasks and datasets.
   Submitted 25 January, 2025; originally announced January 2025.
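The skip-by-redundancy idea can be sketched as block matching between consecutive frames: only blocks that changed are sent back through the network, while cached outputs are reused elsewhere. The block size, threshold, and names below are illustrative assumptions, not the paper's pipeline:

```python
# Hedged sketch: flag blocks whose content changed between frames, so the
# expensive network runs only on those blocks.
import numpy as np

BLOCK = 16

def changed_blocks(prev, cur, thresh=2.0):
    """Yield (row, col) of blocks whose mean absolute difference is large."""
    h, w = cur.shape
    for i in range(0, h, BLOCK):
        for j in range(0, w, BLOCK):
            diff = np.abs(cur[i:i+BLOCK, j:j+BLOCK].astype(np.float32)
                          - prev[i:i+BLOCK, j:j+BLOCK])
            if diff.mean() > thresh:
                yield i, j

rng = np.random.default_rng(2)
prev = rng.integers(0, 200, (128, 128), dtype=np.uint8)
cur = prev.copy()
cur[32:48, 64:80] += 50                  # simulate motion in one block
todo = list(changed_blocks(prev, cur))
print(f"recompute {len(todo)} of {(128 // BLOCK) ** 2} blocks:", todo)
```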
5. arXiv:2501.11844 [pdf, other]  eess.SP
   Keypoint Detection Empowered Near-Field User Localization and Channel Reconstruction
   Authors: Mengyuan Li, Yu Han, Zhizheng Lu, Shi Jin, Yongxu Zhu, Chao-Kai Wen
   Abstract: In the near-field region of an extremely large-scale multiple-input multiple-output (XL MIMO) system, channel reconstruction is typically addressed through sparse parameter estimation based on compressed sensing (CS) algorithms after converting the received pilot signals into the transformed domain. However, the exhaustive codebook search in CS algorithms consumes significant computational resources and running time, particularly when a large number of antennas are equipped at the base station (BS). To overcome this challenge, we propose a novel scheme that replaces the high-cost exhaustive search procedure. We visualize the sparse channel matrix in the transformed domain as a channel image and design the channel keypoint detection network (CKNet) to locate the user and scatterers at high speed. Subsequently, we use a small-scale Newtonized orthogonal matching pursuit (NOMP) based refiner to further enhance the precision. Our method is applicable to both the Cartesian domain and the polar domain. Additionally, to handle scenarios with a flexible number of propagation paths, we further design FlexibleCKNet to predict both locations and confidence scores. Our experimental results validate that the CKNet- and FlexibleCKNet-empowered channel reconstruction scheme can significantly reduce computational complexity while maintaining high accuracy in both user and scatterer localization and channel reconstruction tasks.
   Submitted 20 January, 2025; originally announced January 2025.
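A toy version of "keypoints on a channel image" helps fix the input/output shape of such a detector: treat the transformed-domain channel magnitude as an image and report local maxima as user/scatterer locations. CKNet learns this mapping; the plain peak detector below is only an illustration with made-up values:

```python
# Hedged toy: locate peaks in a synthetic "channel image".
import numpy as np

def detect_peaks(img, thresh):
    """Return (row, col) of local maxima above thresh."""
    peaks = []
    for i in range(1, img.shape[0] - 1):
        for j in range(1, img.shape[1] - 1):
            patch = img[i-1:i+2, j-1:j+2]
            if img[i, j] == patch.max() and img[i, j] > thresh:
                peaks.append((i, j))
    return peaks

rng = np.random.default_rng(3)
img = 0.1 * rng.random((32, 32))   # background "noise floor"
img[5, 20] = 1.0                    # hypothetical user peak
img[17, 8] = 0.8                    # hypothetical scatterer peak
print(detect_peaks(img, thresh=0.5))   # -> [(5, 20), (17, 8)]
```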
6. arXiv:2501.08057 [pdf, other]  eess.AS cs.AI cs.CL cs.SD
   Optimizing Speech Multi-View Feature Fusion through Conditional Computation
   Authors: Weiqiao Shan, Yuhao Zhang, Yuchen Han, Bei Li, Xiaofeng Zhao, Yuang Li, Min Zhang, Hao Yang, Tong Xiao, Jingbo Zhu
   Abstract: Recent advancements have highlighted the efficacy of self-supervised learning (SSL) features in various speech-related tasks, providing lightweight and versatile multi-view speech representations. However, our study reveals that while SSL features expedite model convergence, they conflict with traditional spectral features like FBanks in terms of update directions. In response, we propose a novel generalized feature fusion framework grounded in conditional computation, featuring a gradient-sensitive gating network and a multi-stage dropout strategy. This framework mitigates feature conflicts and bolsters model robustness to multi-view input features. By integrating SSL and spectral features, our approach accelerates convergence and maintains performance on par with spectral models across multiple speech translation tasks on the MUSTC dataset.
   Submitted 14 January, 2025; originally announced January 2025.
   Comments: ICASSP 2025
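Gated two-view fusion of the kind this abstract describes can be sketched as a learned per-frame convex mix of the two feature streams. This is a generic fusion gate with untrained, made-up parameters; the paper's gradient-sensitive gating and multi-stage dropout are not reproduced:

```python
# Hedged sketch: a sigmoid gate decides, per frame, how much SSL feature
# versus FBank feature to pass downstream.
import numpy as np

rng = np.random.default_rng(4)
T, d = 100, 80                       # frames x feature dim (illustrative)
ssl = rng.standard_normal((T, d))    # SSL view (e.g., pretrained encoder)
fbank = rng.standard_normal((T, d))  # spectral view (FBanks), same dim

W = rng.standard_normal((2 * d, 1)) * 0.01   # gate parameters (untrained)

def fuse(ssl, fbank, W):
    gate_in = np.concatenate([ssl, fbank], axis=-1)   # (T, 2d)
    g = 1.0 / (1.0 + np.exp(-(gate_in @ W)))          # (T, 1), in (0, 1)
    return g * ssl + (1.0 - g) * fbank                # convex mix per frame

fused = fuse(ssl, fbank, W)
print(fused.shape)   # (100, 80)
```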
7. arXiv:2501.05093 [pdf, other]  cs.LG eess.SP
   Hierarchical Decomposed Dual-domain Deep Learning for Sparse-View CT Reconstruction
   Authors: Yoseob Han
   Abstract: Objective: X-ray computed tomography employing sparse projection views has emerged as a contemporary technique to mitigate radiation dose. However, due to the inadequate number of projection views, analytic reconstruction via filtered backprojection produces severe streaking artifacts. Recently, deep learning strategies employing image-domain networks have demonstrated remarkable performance in eliminating the streaking artifacts caused by analytic reconstruction with sparse projection views. Nevertheless, the theoretical justification for applying deep learning to sparse-view CT reconstruction is difficult to establish, and such methods have been understood as restoration by artifact removal rather than reconstruction. Approach: By leveraging the theory of deep convolutional framelets and the hierarchical decomposition of measurement, this research reveals the constraints of conventional image- and projection-domain deep learning methodologies and then proposes a novel dual-domain deep learning framework utilizing hierarchically decomposed measurements. Specifically, it elucidates how the performance of the projection-domain network can be enhanced through the low-rank property of deep convolutional framelets and the bowtie support of the hierarchically decomposed measurement in the Fourier domain. Main Results: This study demonstrates performance improvement of the proposed framework based on the low-rank property, resulting in superior reconstruction performance compared to conventional analytic and deep learning methods. Significance: By providing a theoretically justified deep learning approach for sparse-view CT reconstruction, this study not only offers a superior alternative to existing methods but also opens new avenues for research in medical imaging.
   Submitted 9 January, 2025; originally announced January 2025.
   Comments: Published in Physics in Medicine & Biology (2024.4)
8. arXiv:2501.05085 [pdf, other]  eess.IV cs.CV cs.LG
   End-to-End Deep Learning for Interior Tomography with Low-Dose X-ray CT
   Authors: Yoseob Han, Dufan Wu, Kyungsang Kim, Quanzheng Li
   Abstract: Objective: Several X-ray computed tomography (CT) scanning strategies exist to reduce radiation dose, such as (1) sparse-view CT, (2) low-dose CT, and (3) region-of-interest (ROI) CT (called interior tomography). To further reduce the dose, the sparse-view and/or low-dose CT settings can be combined with interior tomography. Interior tomography has various advantages in terms of reducing the number of detectors and decreasing the X-ray radiation dose. However, a large patient or a small field-of-view (FOV) detector can cause truncated projections, and the reconstructed images then suffer from severe cupping artifacts. In addition, although low-dose CT can reduce the radiation exposure dose, analytic reconstruction algorithms produce image noise. Recently, many researchers have utilized image-domain deep learning (DL) approaches to remove each artifact and demonstrated impressive performance, with the theory of deep convolutional framelets supporting the reason for the improvement. Approach: In this paper, we found that the image-domain convolutional neural network (CNN) struggles to solve coupled artifacts, based on deep convolutional framelets. To address the coupled problem, we decouple it into two sub-problems: (i) image-domain noise reduction inside the truncated projection, to solve the low-dose CT problem, and (ii) extrapolation of the projection outside the truncation, to solve the ROI CT problem. The decoupled sub-problems are solved directly with a novel proposed end-to-end learning scheme using dual-domain CNNs. Main results: We demonstrate that the proposed method outperforms conventional image-domain deep learning methods, and that a projection-domain CNN shows better performance than the image-domain CNNs commonly used by many researchers.
   Submitted 9 January, 2025; originally announced January 2025.
   Comments: Published in Physics in Medicine & Biology (2022.5)
9. arXiv:2412.06624 [pdf, other]  eess.IV cs.AI cs.CV
   Fundus Image-based Visual Acuity Assessment with PAC-Guarantees
   Authors: Sooyong Jang, Kuk Jin Jang, Hyonyoung Choi, Yong-Seop Han, Seongjin Lee, Jin-hyun Kim, Insup Lee
   Abstract: Timely detection and treatment are essential for maintaining eye health. Visual acuity (VA), which measures the clarity of vision at a distance, is a crucial metric for managing eye health. Machine learning (ML) techniques have been introduced to assist in VA measurement, potentially alleviating clinicians' workloads. However, the inherent uncertainties in ML models make relying solely on them for VA prediction less than ideal. The VA prediction task involves multiple sources of uncertainty, requiring more robust approaches. A promising method is to build prediction sets or intervals rather than point estimates, offering coverage guarantees through techniques like conformal prediction and Probably Approximately Correct (PAC) prediction sets. Despite this potential, these approaches have not yet been applied to the VA prediction task. To address this, we propose a method for deriving prediction intervals for estimating visual acuity from fundus images with a PAC guarantee. Our experimental results demonstrate that the PAC guarantees are upheld, with performance comparable to or better than that of two prior works that do not provide such guarantees.
   Submitted 9 December, 2024; originally announced December 2024.
   Comments: To be published in ML4H 2024
10. arXiv:2412.04639 [pdf, other]  physics.med-ph cs.CV eess.IV
    Motion-Guided Deep Image Prior for Cardiac MRI
    Authors: Marc Vornehm, Chong Chen, Muhammad Ahmad Sultan, Syed Murtaza Arshad, Yuchi Han, Florian Knoll, Rizwan Ahmad
    Abstract: Cardiovascular magnetic resonance imaging is a powerful diagnostic tool for assessing cardiac structure and function. Traditional breath-held imaging protocols, however, pose challenges for patients with arrhythmias or limited breath-holding capacity. We introduce Motion-Guided Deep Image prior (M-DIP), a novel unsupervised reconstruction framework for accelerated real-time cardiac MRI. M-DIP employs a spatial dictionary to synthesize a time-dependent template image, which is further refined using time-dependent deformation fields that model cardiac and respiratory motion. Unlike prior DIP-based methods, M-DIP simultaneously captures physiological motion and frame-to-frame content variations, making it applicable to a wide range of dynamic applications. We validate M-DIP using simulated MRXCAT cine phantom data as well as free-breathing real-time cine and single-shot late gadolinium enhancement data from clinical patients. Comparative analyses against state-of-the-art supervised and unsupervised approaches demonstrate M-DIP's performance and versatility. M-DIP achieved better image quality metrics on phantom data, as well as higher reader scores for in-vivo patient data.
    Submitted 5 December, 2024; originally announced December 2024.
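The "template plus deformation field" building block of M-DIP amounts to warping an image by a per-pixel displacement field. The toy below shows only that warping operation (the real method learns both the template and the fields; the square, field, and sizes are made up):

```python
# Hedged toy: warp a template image by a displacement field.
import numpy as np
from scipy.ndimage import map_coordinates

H, W = 64, 64
template = np.zeros((H, W))
template[24:40, 24:40] = 1.0            # a bright square as the "heart"

# Displacement field: shift content 3 pixels right and 2 down (uniform for
# simplicity; real fields vary smoothly per pixel and per frame).
dy, dx = np.full((H, W), 2.0), np.full((H, W), 3.0)
rows, cols = np.meshgrid(np.arange(H), np.arange(W), indexing="ij")
warped = map_coordinates(template, [rows - dy, cols - dx], order=1)
print(template[26, 26], warped[28, 29])  # mass moved by (+2, +3): both 1.0
```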
We call this approach spectrum clipping (SC) as it involves eigen decomposition and subsequent reconstruction of the system matrix after clipping all of its eigenvalues that are larger than one to one (without altering the eigenvectors). Through detailed experiments involving two different applications and publicly available benchmark datasets, we demonstrate that this simple technique can simultaneously learn highly accurate linear systems that are provably stable. Notably, we demonstrate that SC can achieve similar or better performance than strong baselines while being orders-of-magnitude faster. We also show that SC can be readily combined with Koopman operators to learn stable nonlinear dynamics, such as those underlying complex dexterous manipulation skills involving multi-fingered robotic hands. Further, we find that SC can learn stable robot policies even when the training data includes unsuccessful or truncated demonstrations. Our codes and dataset can be found at https://github.com/GT-STAR-Lab/spec_clip. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.01168v3-abstract-full').style.display = 'none'; document.getElementById('2412.01168v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 2 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Under review by L4DC 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.14088">arXiv:2411.14088</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.14088">pdf</a>, <a href="https://arxiv.org/format/2411.14088">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Channel Customization for Low-Complexity CSI Acquisition in Multi-RIS-Assisted MIMO Systems </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Chen%2C+W">Weicong Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Han%2C+Y">Yu Han</a>, <a href="/search/eess?searchtype=author&amp;query=Wen%2C+C">Chao-Kai Wen</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+X">Xiao Li</a>, <a href="/search/eess?searchtype=author&amp;query=Jin%2C+S">Shi Jin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.14088v1-abstract-short" style="display: inline;"> The deployment of multiple reconfigurable intelligent surfaces (RISs) enhances the propagation environment by improving channel quality, but it also complicates channel estimation. 
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.14088">arXiv:2411.14088</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.14088">pdf</a>, <a href="https://arxiv.org/format/2411.14088">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Channel Customization for Low-Complexity CSI Acquisition in Multi-RIS-Assisted MIMO Systems </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Chen%2C+W">Weicong Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Han%2C+Y">Yu Han</a>, <a href="/search/eess?searchtype=author&amp;query=Wen%2C+C">Chao-Kai Wen</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+X">Xiao Li</a>, <a href="/search/eess?searchtype=author&amp;query=Jin%2C+S">Shi Jin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> The deployment of multiple reconfigurable intelligent surfaces (RISs) enhances the propagation environment by improving channel quality, but it also complicates channel estimation. Following the conventional wireless communication system design, which involves full channel state information (CSI) acquisition followed by RIS configuration, can reduce transmission efficiency due to substantial pilot overhead and computational complexity. This study introduces an innovative approach that integrates CSI acquisition and RIS configuration, leveraging the channel-altering capabilities of the RIS to reduce both the overhead and complexity of CSI acquisition. The focus is on multi-RIS-assisted systems, featuring both direct and reflected propagation paths. By applying a fast-varying reflection sequence during RIS configuration for channel training, the complex problem of channel estimation is decomposed into simpler, independent tasks. These fast-varying reflections effectively isolate transmit signals from different paths, streamlining the CSI acquisition process for both uplink and downlink communications with reduced complexity. In uplink scenarios, a positioning-based algorithm derives partial CSI, informing the adjustment of RIS parameters to create a sparse reflection channel, enabling precise reconstruction of the uplink channel. Downlink communication benefits from this strategically tailored reflection channel, allowing effective CSI acquisition with fewer pilot signals. Simulation results highlight the proposed methodology&#39;s ability to accurately reconstruct the reflection channel with minimal impact on the normalized mean square error while simultaneously enhancing spectral efficiency. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by IEEE JSAC special issue on Next Generation Advanced Transceiver Technologies</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.01589">arXiv:2411.01589</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.01589">pdf</a>, <a href="https://arxiv.org/format/2411.01589">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> BiT-MamSleep: Bidirectional Temporal Mamba for EEG Sleep Staging </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Zhou%2C+X">Xinliang Zhou</a>, <a href="/search/eess?searchtype=author&amp;query=Han%2C+Y">Yuzhe Han</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+Z">Zhisheng Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+C">Chenyu Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Ding%2C+Y">Yi Ding</a>, <a href="/search/eess?searchtype=author&amp;query=Jia%2C+Z">Ziyu Jia</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+Y">Yang Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.01589v2-abstract-short" style="display: inline;"> In this paper, we address the challenges in automatic sleep stage classification, particularly the high computational cost, inadequate modeling of bidirectional temporal dependencies, and class imbalance issues faced by Transformer-based models. To address these limitations, we propose BiT-MamSleep, a novel architecture that integrates the Triple-Resolution CNN (TRCNN) for efficient multi-scale fe&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01589v2-abstract-full').style.display = 'inline'; document.getElementById('2411.01589v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.01589v2-abstract-full" style="display: none;"> In this paper, we address the challenges in automatic sleep stage classification, particularly the high computational cost, inadequate modeling of bidirectional temporal dependencies, and class imbalance issues faced by Transformer-based models. To address these limitations, we propose BiT-MamSleep, a novel architecture that integrates the Triple-Resolution CNN (TRCNN) for efficient multi-scale feature extraction with the Bidirectional Mamba (BiMamba) mechanism, which models both short- and long-term temporal dependencies through bidirectional processing of EEG data. Additionally, BiT-MamSleep incorporates an Adaptive Feature Recalibration (AFR) module and a temporal enhancement block to dynamically refine feature importance, optimizing classification accuracy without increasing computational complexity. To further improve robustness, we apply optimization techniques such as Focal Loss and SMOTE to mitigate class imbalance. 
Extensive experiments on four public datasets demonstrate that BiT-MamSleep significantly outperforms state-of-the-art methods, particularly in handling long EEG sequences and addressing class imbalance, leading to more accurate and scalable sleep stage classification. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 3 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p>
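<p class="is-size-7">For readers unfamiliar with the first of the class-imbalance remedies named above, the standard multi-class focal loss can be written in a few lines of PyTorch. This is a generic sketch of the published formulation, not the BiT-MamSleep training code; the five-stage toy setup and hyperparameter values are illustrative assumptions.</p>
<pre><code class="language-python">
import torch
import torch.nn.functional as F

def focal_loss(logits, targets, gamma=2.0, alpha=None):
    # FL(p_t) = -alpha_t * (1 - p_t)^gamma * log(p_t):
    # easy examples (p_t near 1) are down-weighted, so training
    # focuses on rare classes such as the N1 sleep stage.
    log_p = F.log_softmax(logits, dim=-1)                      # (N, C)
    log_pt = log_p.gather(1, targets.unsqueeze(1)).squeeze(1)  # (N,)
    pt = log_pt.exp()
    loss = -((1.0 - pt) ** gamma) * log_pt
    if alpha is not None:                                      # optional per-class weights
        loss = alpha[targets] * loss
    return loss.mean()

# Toy check with 5 sleep stages (W, N1, N2, N3, REM).
logits = torch.randn(8, 5)
labels = torch.randint(0, 5, (8,))
print(focal_loss(logits, labels).item())
</code></pre>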
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.19877">arXiv:2410.19877</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.19877">pdf</a>, <a href="https://arxiv.org/format/2410.19877">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Foundation Models in Electrocardiogram: A Review </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Han%2C+Y">Yu Han</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+X">Xiaofeng Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+X">Xiang Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Ding%2C+C">Cheng Ding</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> The electrocardiogram (ECG) is ubiquitous across various healthcare domains, such as cardiac arrhythmia detection and sleep monitoring, making ECG analysis critically essential. Traditional deep learning models for ECG are task-specific, with a narrow scope of functionality and limited generalization capabilities. Recently, foundation models (FMs), also known as large pre-training models, have fundamentally reshaped the scheme of model design and representation learning, enhancing performance across a variety of downstream tasks. This success has drawn interest in exploring FMs to address ECG-based medical challenges. This survey provides a timely, comprehensive and up-to-date overview of large-scale ECG foundation models (ECG-FMs). First, we offer a brief background introduction to FMs. Then, we discuss the model architectures, pre-training methods, and adaptation approaches of ECG-FMs from a methodology perspective. Despite the promising opportunities of ECG-FMs, we also outline the challenges and potential future directions. Overall, this survey aims to provide researchers and practitioners with insights into the research of ECG-FMs on theoretical underpinnings, domain-specific applications, and avenues for future exploration. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 24 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.03320">arXiv:2410.03320</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.03320">pdf</a>, <a href="https://arxiv.org/format/2410.03320">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1007/978-3-031-72114-4_40">10.1007/978-3-031-72114-4_40 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Lost in Tracking: Uncertainty-guided Cardiac Cine MRI Segmentation at Right Ventricle Base </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Zhao%2C+Y">Yidong Zhao</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+Y">Yi Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Simonetti%2C+O">Orlando Simonetti</a>, <a href="/search/eess?searchtype=author&amp;query=Han%2C+Y">Yuchi Han</a>, <a href="/search/eess?searchtype=author&amp;query=Tao%2C+Q">Qian Tao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax">
Accurate biventricular segmentation of cardiac magnetic resonance (CMR) cine images is essential for the clinical evaluation of heart function. However, compared to the left ventricle (LV), right ventricle (RV) segmentation is still more challenging and less reproducible. Degenerate performance frequently occurs at the RV base, where the in-plane anatomical structures are complex (with atria, valve, and aorta) and vary due to the strong interplanar motion. In this work, we propose to address the currently unsolved issues in CMR segmentation, specifically at the RV base, with two strategies: first, we complemented the public resource by reannotating the RV base in the ACDC dataset, with refined delineation of the right ventricle outflow tract (RVOT), under the guidance of an expert cardiologist. Second, we proposed a novel dual encoder U-Net architecture that leverages temporal incoherence to inform the segmentation when interplanar motions occur. The inter-planar motion is characterized by loss-of-tracking, via Bayesian uncertainty of a motion-tracking model. Our experiments showed that our method significantly improved RV base segmentation by taking temporal incoherence into account. Furthermore, we investigated the reproducibility of deep learning-based segmentation and showed that the combination of consistent annotation and loss of tracking could enhance the reproducibility of RV segmentation, potentially facilitating a large number of clinical studies focusing on RV. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024.
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.15105">arXiv:2409.15105</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.15105">pdf</a>, <a href="https://arxiv.org/format/2409.15105">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multiagent Systems">cs.MA</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> SPformer: A Transformer Based DRL Decision Making Method for Connected Automated Vehicles </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Han%2C+Y">Ye Han</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+L">Lijun Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Meng%2C+D">Dejian Meng</a>, <a href="/search/eess?searchtype=author&amp;query=Hu%2C+X">Xingyu Hu</a>, <a href="/search/eess?searchtype=author&amp;query=Lu%2C+Y">Yixia Lu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.15105v1-abstract-short" style="display: inline;"> In mixed autonomy traffic environment, every decision made by an autonomous-driving car may have a great impact on the transportation system. Because of the complex interaction between vehicles, it is challenging to make decisions that can ensure both high traffic efficiency and safety now and futher. Connected automated vehicles (CAVs) have great potential to improve the quality of decision-makin&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.15105v1-abstract-full').style.display = 'inline'; document.getElementById('2409.15105v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.15105v1-abstract-full" style="display: none;"> In mixed autonomy traffic environment, every decision made by an autonomous-driving car may have a great impact on the transportation system. Because of the complex interaction between vehicles, it is challenging to make decisions that can ensure both high traffic efficiency and safety now and futher. Connected automated vehicles (CAVs) have great potential to improve the quality of decision-making in this continuous, highly dynamic and interactive environment because of their stronger sensing and communicating ability. For multi-vehicle collaborative decision-making algorithms based on deep reinforcement learning (DRL), we need to represent the interactions between vehicles to obtain interactive features. The representation in this aspect directly affects the learning efficiency and the quality of the learned policy. To this end, we propose a CAV decision-making architecture based on transformer and reinforcement learning algorithms. A learnable policy token is used as the learning medium of the multi-vehicle joint policy, the states of all vehicles in the area of interest can be adaptively noticed in order to extract interactive features among agents. We also design an intuitive physical positional encodings, the redundant location information of which optimizes the performance of the network. 
Simulations show that our model can make good use of all the state information of vehicles in the traffic scenario, so as to obtain high-quality driving decisions that meet efficiency and safety objectives. The comparison shows that our method significantly improves on existing DRL-based multi-vehicle cooperative decision-making algorithms. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p>
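<p class="is-size-7">The role of the learnable policy token can be illustrated with a small PyTorch sketch: the token is prepended to the embedded states of all vehicles in the area of interest, self-attention lets it aggregate interactive features, and a head reads joint-policy logits from the token slot. All layer sizes and the action head below are illustrative assumptions rather than the SPformer specification, and the paper&#39;s physical positional encodings are omitted.</p>
<pre><code class="language-python">
import torch
import torch.nn as nn

class PolicyTokenEncoder(nn.Module):
    # A learnable token attends over per-vehicle state embeddings; its
    # output summarizes the scene for a joint-policy head.
    def __init__(self, state_dim=6, d_model=64, n_actions=5, n_layers=2):
        super().__init__()
        self.embed = nn.Linear(state_dim, d_model)
        self.policy_token = nn.Parameter(torch.zeros(1, 1, d_model))
        layer = nn.TransformerEncoderLayer(d_model, nhead=4, batch_first=True)
        self.encoder = nn.TransformerEncoder(layer, num_layers=n_layers)
        self.head = nn.Linear(d_model, n_actions)

    def forward(self, states):              # states: (B, n_vehicles, state_dim)
        x = self.embed(states)
        tok = self.policy_token.expand(x.size(0), -1, -1)
        h = self.encoder(torch.cat([tok, x], dim=1))
        return self.head(h[:, 0])           # read the policy-token slot

logits = PolicyTokenEncoder()(torch.randn(2, 8, 6))   # 2 scenes, 8 CAVs each
print(logits.shape)                                   # torch.Size([2, 5])
</code></pre>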
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.13783">arXiv:2409.13783</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.13783">pdf</a>, <a href="https://arxiv.org/format/2409.13783">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Multiagent Systems">cs.MA</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Science and Game Theory">cs.GT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> A Value Based Parallel Update MCTS Method for Multi-Agent Cooperative Decision Making of Connected and Automated Vehicles </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Han%2C+Y">Ye Han</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+L">Lijun Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Meng%2C+D">Dejian Meng</a>, <a href="/search/eess?searchtype=author&amp;query=Hu%2C+X">Xingyu Hu</a>, <a href="/search/eess?searchtype=author&amp;query=Weng%2C+S">Songyu Weng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> To solve the problem of lateral and longitudinal joint decision-making of multi-vehicle cooperative driving for connected and automated vehicles (CAVs), this paper proposes a Monte Carlo tree search (MCTS) method with parallel update for multi-agent Markov games with a limited-horizon and time-discounted setting. By analyzing the parallel actions in the multi-vehicle joint action space in the partial-steady-state traffic flow, the parallel update method can quickly exclude potentially dangerous actions, thereby increasing the search depth without sacrificing the search breadth. The proposed method is tested on a large number of randomly generated traffic flows. The experimental results show that the algorithm has good robustness and better performance than the SOTA reinforcement learning algorithms and heuristic methods. The vehicle driving strategy using the proposed algorithm shows rationality beyond human drivers, and has advantages in traffic efficiency and safety in the coordinating zone. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">arXiv admin note: text overlap with arXiv:2408.04295 by other authors</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.13067">arXiv:2409.13067</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.13067">pdf</a>, <a href="https://arxiv.org/format/2409.13067">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> E-Sort: Empowering End-to-end Neural Network for Multi-channel Spike Sorting with Transfer Learning and Fast Post-processing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Han%2C+Y">Yuntao Han</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+S">Shiwei Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Decoding extracellular recordings is a crucial task in electrophysiology and brain-computer interfaces.
Spike sorting, which distinguishes spikes and their putative neurons from extracellular recordings, becomes computationally demanding with the increasing number of channels in modern neural probes. To address the intensive workload and complex neuron interactions, we propose E-Sort, an end-to-end neural network-based spike sorter with transfer learning and parallelizable post-processing. Our framework reduces the required number of annotated spikes for training by 44% compared to training from scratch, achieving up to 25.68% higher accuracy. Additionally, our novel post-processing algorithm is compatible with deep learning frameworks, making E-Sort significantly faster than state-of-the-art spike sorters. On synthesized Neuropixels recordings, E-Sort achieves comparable accuracy with Kilosort4 while sorting 50 seconds of data in only 1.32 seconds. Our method demonstrates robustness across various probe geometries, noise levels, and drift conditions, offering a substantial improvement in both accuracy and runtime efficiency compared to existing spike sorters. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 19 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.6; J.3 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.10737">arXiv:2408.10737</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.10737">pdf</a>, <a href="https://arxiv.org/format/2408.10737">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Mid-Band Extra Large-Scale MIMO System: Channel Modeling and Performance Analysis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Tian%2C+J">Jiachen Tian</a>, <a href="/search/eess?searchtype=author&amp;query=Han%2C+Y">Yu Han</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+X">Xiao Li</a>, <a href="/search/eess?searchtype=author&amp;query=Jin%2C+S">Shi Jin</a>, <a href="/search/eess?searchtype=author&amp;query=Wen%2C+C">Chao-Kai Wen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> In pursuit of enhanced quality of service and higher transmission rates, communication within the mid-band spectrum, such as bands in the 6-15 GHz range, combined with extra large-scale multiple-input multiple-output (XL-MIMO), is considered a potential enabler for future communication systems.
However, the characteristics introduced by mid-band XL-MIMO systems pose challenges for channel modeling and performance analysis. In this paper, we first analyze the potential characteristics of mid-band MIMO channels. Then, an analytical channel model incorporating novel channel characteristics is proposed, based on a review of classical analytical channel models. This model is convenient for theoretical analysis and compatible with other analytical channel models. Subsequently, based on the proposed channel model, we analyze key metrics of wireless communication, including the ergodic spectral efficiency (SE) and outage probability (OP) of MIMO maximal-ratio combining systems. Specifically, we derive closed-form approximations and performance bounds for two typical scenarios, aiming to illustrate the influence of mid-band XL-MIMO systems. Finally, comparisons between systems under different practical configurations are carried out through simulations. The theoretical analysis and simulations demonstrate that mid-band XL-MIMO systems excel in SE and OP due to the increased array elements, moderate large-scale fading, and enlarged transmission bandwidth. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">16 pages, 10 figures</span> </p>
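<p class="is-size-7">As a point of reference for the ergodic SE metric analyzed above, the quantity is straightforward to estimate by Monte Carlo under plain i.i.d. Rayleigh fading, as in the sketch below. This toy baseline ignores the mid-band XL-MIMO effects (e.g., near-field propagation and spatial non-stationarity) that the paper&#39;s channel model captures.</p>
<pre><code class="language-python">
import numpy as np

def ergodic_se_mrc(n_rx, snr_db, trials=10_000, seed=0):
    # Ergodic spectral efficiency E[log2(1 + SNR * ||h||^2)] in bits/s/Hz
    # for a 1 x n_rx maximal-ratio combining receiver, where h has
    # i.i.d. CN(0, 1) entries (Rayleigh fading).
    rng = np.random.default_rng(seed)
    snr = 10.0 ** (snr_db / 10.0)
    h = (rng.standard_normal((trials, n_rx)) +
         1j * rng.standard_normal((trials, n_rx))) / np.sqrt(2.0)
    gain = np.sum(np.abs(h) ** 2, axis=1)   # MRC array gain, grows with n_rx
    return np.mean(np.log2(1.0 + snr * gain))

for n in (16, 64, 256):                     # larger arrays yield higher SE
    print(n, round(ergodic_se_mrc(n, snr_db=0.0), 2))
</code></pre>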
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">16 pages, 10 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.11705">arXiv:2407.11705</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.11705">pdf</a>, <a href="https://arxiv.org/format/2407.11705">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Snail-Radar: A large-scale diverse dataset for the evaluation of 4D-radar-based SLAM systems </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Huai%2C+J">Jianzhu Huai</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+B">Binliang Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Zhuang%2C+Y">Yuan Zhuang</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+Y">Yiwen Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+Q">Qipeng Li</a>, <a href="/search/eess?searchtype=author&amp;query=Han%2C+Y">Yulong Han</a>, <a href="/search/eess?searchtype=author&amp;query=Toth%2C+C">Charles Toth</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.11705v2-abstract-short" style="display: inline;"> 4D radars are increasingly favored for odometry and mapping of autonomous systems due to their robustness in harsh weather and dynamic environments. Existing datasets, however, often cover limited areas and are typically captured using a single platform. To address this gap, we present a diverse large-scale dataset specifically designed for 4D radar-based localization and mapping. This dataset was&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.11705v2-abstract-full').style.display = 'inline'; document.getElementById('2407.11705v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.11705v2-abstract-full" style="display: none;"> 4D radars are increasingly favored for odometry and mapping of autonomous systems due to their robustness in harsh weather and dynamic environments. Existing datasets, however, often cover limited areas and are typically captured using a single platform. To address this gap, we present a diverse large-scale dataset specifically designed for 4D radar-based localization and mapping. This dataset was gathered using three different platforms: a handheld device, an e-bike, and an SUV, under a variety of environmental conditions, including clear days, nighttime, and heavy rain. The data collection occurred from September 2023 to February 2024, encompassing diverse settings such as roads in a vegetated campus and tunnels on highways. Each route was traversed multiple times to facilitate place recognition evaluations. The sensor suite included a 3D lidar, 4D radars, stereo cameras, consumer-grade IMUs, and a GNSS/INS system. 
Sensor data packets were synchronized to GNSS time using a two-step process: a convex hull algorithm was applied to smooth host time jitter, and then odometry and correlation algorithms were used to correct constant time offsets. Extrinsic calibration between sensors was achieved through manual measurements and subsequent nonlinear optimization. The reference motion for the platforms was generated by registering lidar scans to a terrestrial laser scanner (TLS) point cloud map using a lidar inertial odometry (LIO) method in localization mode. Additionally, a data reversion technique was introduced to enable backward LIO processing. We believe this dataset will boost research in radar-based point cloud registration, odometry, mapping, and place recognition. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 16 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">11 pages, 4 figures, 5 tables</span> </p>
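<p class="is-size-7">As a generic illustration of the correlation idea in the synchronization pipeline described above (not the dataset&#39;s actual tooling), a constant time offset between two uniformly sampled signals can be recovered from the peak of their cross-correlation:</p>
<pre><code class="language-python">
import numpy as np

def constant_offset(sig_a, sig_b, dt):
    # Estimate a constant time offset from the peak of the full
    # cross-correlation of the zero-mean signals; a positive value
    # means sig_b lags sig_a.
    a = sig_a - sig_a.mean()
    b = sig_b - sig_b.mean()
    lag = np.argmax(np.correlate(a, b, mode="full")) - (len(b) - 1)
    return -lag * dt

# Toy check: a smooth motion-like trace delayed by 0.12 s.
dt = 0.01
t = np.arange(0.0, 20.0, dt)
sig = np.sin(0.7 * t) * np.cos(2.3 * t)
delayed = np.interp(t - 0.12, t, sig)     # second sensor lags by 0.12 s
print(constant_offset(sig, delayed, dt))  # ~0.12
</code></pre>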
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.10377">arXiv:2407.10377</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.10377">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Enhanced Masked Image Modeling to Avoid Model Collapse on Multi-modal MRI Datasets </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Han%2C+L">Linxuan Han</a>, <a href="/search/eess?searchtype=author&amp;query=Xiao%2C+S">Sa Xiao</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+Z">Zimeng Li</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+H">Haidong Li</a>, <a href="/search/eess?searchtype=author&amp;query=Zhao%2C+X">Xiuchao Zhao</a>, <a href="/search/eess?searchtype=author&amp;query=Han%2C+Y">Yeqing Han</a>, <a href="/search/eess?searchtype=author&amp;query=Guo%2C+F">Fumin Guo</a>, <a href="/search/eess?searchtype=author&amp;query=Zhou%2C+X">Xin Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Multi-modal magnetic resonance imaging (MRI) provides information about lesions for computer-aided diagnosis from different views. Deep learning algorithms are suitable for identifying specific anatomical structures, segmenting lesions, and classifying diseases. Manual labels are limited due to the high expense, which hinders further improvement of accuracy. Self-supervised learning, particularly masked image modeling (MIM), has shown promise in utilizing unlabeled data. However, we observe model collapse when applying MIM to multi-modal MRI datasets, and the performance of downstream tasks sees no improvement from a collapsed model. We analyze and address model collapse in two forms: complete collapse and dimensional collapse. We find complete collapse occurs because the collapsed loss value in multi-modal MRI datasets falls below the normally converged loss value. Based on this, the hybrid mask pattern (HMP) masking strategy is introduced to elevate the collapsed loss above the normally converged loss value and avoid complete collapse. Additionally, we reveal that dimensional collapse stems from insufficient feature uniformity in MIM. We mitigate dimensional collapse by introducing the pyramid barlow twins (PBT) module as an explicit regularization method. Overall, we construct the enhanced MIM (E-MIM) with the HMP and PBT modules to avoid model collapse on multi-modal MRI datasets. Experiments are conducted on three multi-modal MRI datasets to validate the effectiveness of our approach in preventing both types of model collapse. By preventing model collapse, the training of the model becomes more stable, resulting in a decent improvement in performance for segmentation and classification tasks. The code is available at https://github.com/LinxuanHan/E-MIM. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 14 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This work has been submitted to the IEEE for possible publication. Copyright may be transferred without notice, after which this version may no longer be accessible.</span> </p>
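<p class="is-size-7">For context on the PBT regularizer mentioned above: it builds on the Barlow Twins redundancy-reduction loss, which drives the cross-correlation matrix of two embedding views toward the identity and thereby directly counteracts dimensional collapse. A minimal PyTorch version of that base loss is sketched below; the paper&#39;s pyramid (multi-scale) arrangement is not shown.</p>
<pre><code class="language-python">
import torch

def barlow_twins_loss(z1, z2, lam=5e-3):
    # z1, z2: (N, D) embeddings of two views of the same batch.
    # Normalize per dimension, form the D x D cross-correlation, and
    # push it toward the identity: unit diagonal, zero off-diagonal.
    n = z1.shape[0]
    z1 = (z1 - z1.mean(0)) / (z1.std(0) + 1e-6)
    z2 = (z2 - z2.mean(0)) / (z2.std(0) + 1e-6)
    c = (z1.T @ z2) / n
    on_diag = (torch.diagonal(c) - 1.0).pow(2).sum()
    off_diag = (c - torch.diag(torch.diagonal(c))).pow(2).sum()
    return on_diag + lam * off_diag

print(barlow_twins_loss(torch.randn(32, 128), torch.randn(32, 128)).item())
</code></pre>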
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.19769">arXiv:2406.19769</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.19769">pdf</a>, <a href="https://arxiv.org/format/2406.19769">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Decision Transformer for IRS-Assisted Systems with Diffusion-Driven Generative Channels </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+J">Jie Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+J">Jun Li</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+Z">Zhe Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Han%2C+Y">Yu Han</a>, <a href="/search/eess?searchtype=author&amp;query=Shi%2C+L">Long Shi</a>, <a href="/search/eess?searchtype=author&amp;query=Cao%2C+B">Bin Cao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> In this paper, we propose a novel diffusion-decision transformer (D2T) architecture to optimize the beamforming strategies for intelligent reflecting surface (IRS)-assisted multiple-input single-output (MISO) communication systems. The first challenge lies in the expensive computation cost to recover the real-time channel state information (CSI) from the received pilot signals, which usually requires prior knowledge of the channel distributions. To reduce the channel estimation complexity, we adopt a diffusion model to automatically learn the mapping between the received pilot signals and channel matrices in a model-free manner. The second challenge is that the traditional optimization or reinforcement learning (RL) algorithms cannot guarantee the optimality of the beamforming policies once the channel distribution changes, and it is costly to resolve the optimized strategies. To enhance the generality of the decision models over varying channel distributions, we propose an offline pre-training and online fine-tuning decision transformer (DT) framework, wherein we first pre-train the DT offline with the data samples collected by the RL algorithms under diverse channel distributions, and then fine-tune the DT online with few-shot samples under a new channel distribution for generalization purposes.
Simulation results demonstrate that, compared with retraining RL algorithms, the proposed D2T algorithm boosts the convergence speed by 3 times with only a few samples from the new channel distribution while enhancing the average user data rate by 6%. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.18425">arXiv:2406.18425</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.18425">pdf</a>, <a href="https://arxiv.org/format/2406.18425">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/BioCAS61083.2024.10798317">10.1109/BioCAS61083.2024.10798317 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> L-Sort: An Efficient Hardware for Real-time Multi-channel Spike Sorting with Localization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Han%2C+Y">Yuntao Han</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+S">Shiwei Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Hamilton%2C+A">Alister Hamilton</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Spike sorting is essential for extracting neuronal information from neural signals and understanding brain function. With the advent of high-density microelectrode arrays (HDMEAs), the challenges and opportunities in multi-channel spike sorting have intensified. Real-time spike sorting is particularly crucial for closed-loop brain-computer interface (BCI) applications, demanding efficient hardware implementations. This paper introduces L-Sort, a hardware design for real-time multi-channel spike sorting.
Leveraging spike localization techniques, L-Sort achieves efficient spike detection and clustering without the need to store raw signals during detection. By incorporating median thresholding and geometric features, L-Sort demonstrates promising results in terms of accuracy and hardware efficiency. We assessed the detection and clustering accuracy of our design with publicly available datasets recorded using high-density neural probes (Neuropixel). We implemented our design on an FPGA and compared the results with the state of the art. Results show that our design consumes fewer hardware resources than other FPGA-based spike sorting hardware. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> B.7.1 </p>
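<p class="is-size-7">Median thresholding in spike detection conventionally derives a robust noise estimate as sigma = median(|x|) / 0.6745 and flags threshold crossings against a multiple of it. The software sketch below illustrates that convention on a synthetic trace; it is not the L-Sort hardware datapath, and the threshold factor and refractory gap are illustrative assumptions.</p>
<pre><code class="language-python">
import numpy as np

def detect_spikes(x, k=5.0, refractory=30):
    # Robust noise estimate: for Gaussian noise, median(|x|)/0.6745 ~ sigma.
    sigma = np.median(np.abs(x)) / 0.6745
    crossings = np.flatnonzero(np.abs(x) > k * sigma)
    spikes, last = [], -refractory
    for i in crossings:                 # enforce a refractory gap (samples)
        if i - last >= refractory:
            spikes.append(i)
            last = i
    return np.array(spikes), k * sigma

rng = np.random.default_rng(0)
trace = rng.standard_normal(30_000)
trace[[5_000, 12_000, 21_000]] += 12.0  # inject three synthetic spikes
print(detect_spikes(trace)[0])          # indices of the injected spikes
</code></pre>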
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.03706">arXiv:2406.03706</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.03706">pdf</a>, <a href="https://arxiv.org/format/2406.03706">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Improving Audio Codec-based Zero-Shot Text-to-Speech Synthesis with Multi-Modal Context and Large Language Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Xue%2C+J">Jinlong Xue</a>, <a href="/search/eess?searchtype=author&amp;query=Deng%2C+Y">Yayue Deng</a>, <a href="/search/eess?searchtype=author&amp;query=Han%2C+Y">Yicheng Han</a>, <a href="/search/eess?searchtype=author&amp;query=Gao%2C+Y">Yingming Gao</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+Y">Ya Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Recent advances in large language models (LLMs) and the development of audio codecs have greatly propelled zero-shot TTS. Such systems can synthesize personalized speech with only a 3-second speech sample of an unseen speaker as the acoustic prompt. However, they only support short speech prompts and cannot leverage longer context information, as required in audiobook and conversational TTS scenarios. In this paper, we introduce a novel audio codec-based TTS model to adapt context features with multiple enhancements. Inspired by the success of Qformer, we propose a multi-modal context-enhanced Qformer (MMCE-Qformer) to utilize additional multi-modal context information. Besides, we adapt a pretrained LLM to leverage its understanding ability to predict semantic tokens, and use SoundStorm to generate acoustic tokens, thereby enhancing audio quality and speaker similarity. The extensive objective and subjective evaluations show that our proposed method outperforms baselines across various context TTS scenarios. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by Interspeech 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.20969">arXiv:2405.20969</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2405.20969">pdf</a>, <a href="https://arxiv.org/format/2405.20969">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Design, Calibration, and Control of Compliant Force-sensing Gripping Pads for Humanoid Robots </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Han%2C+Y">Yuanfeng Han</a>, <a href="/search/eess?searchtype=author&amp;query=Jiang%2C+B">Boren Jiang</a>, <a href="/search/eess?searchtype=author&amp;query=Chirikjian%2C+G+S">Gregory S. Chirikjian</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> This paper introduces a pair of low-cost, lightweight and compliant force-sensing gripping pads used for manipulating box-like objects with smaller-sized humanoid robots. These pads measure normal gripping forces and center of pressure (CoP).
A calibration method is developed to improve the CoP measurement accuracy. A hybrid force-alignment-position control framework is proposed to regulate the gripping forces and to ensure the surface alignment between the grippers and the object. Limit surface theory is incorporated as a contact friction modeling approach to determine the magnitude of gripping forces for slippage avoidance. The integrated hardware and software system is demonstrated with a NAO humanoid robot. Experiments show the effectiveness of the overall approach. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">21 pages, 16 figures, Published in ASME Journal of Mechanisms and Robotics</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Journal of Mechanisms and Robotics, 15, 031010, 2023 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.16715">arXiv:2405.16715</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2405.16715">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Coil Reweighting to Suppress Motion Artifacts in Real-Time Exercise Cine Imaging </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Chen%2C+C">Chong Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+Y">Yingmin Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Ding%2C+Y">Yu Ding</a>, <a href="/search/eess?searchtype=author&amp;query=Tong%2C+M">Matthew Tong</a>, <a href="/search/eess?searchtype=author&amp;query=Chandrasekaran%2C+P">Preethi Chandrasekaran</a>, <a href="/search/eess?searchtype=author&amp;query=Crabtree%2C+C">Christopher Crabtree</a>, <a href="/search/eess?searchtype=author&amp;query=Arshad%2C+S+M">Syed M.
arXiv:2405.16715 [pdf] (eess.SP) https://arxiv.org/abs/2405.16715
Title: Coil Reweighting to Suppress Motion Artifacts in Real-Time Exercise Cine Imaging
Authors: Chong Chen, Yingmin Liu, Yu Ding, Matthew Tong, Preethi Chandrasekaran, Christopher Crabtree, Syed M. Arshad, Yuchi Han, Rizwan Ahmad
Abstract: Background: Accelerated real-time cine (RT-Cine) imaging enables cardiac function assessment without the need for breath-holding. However, when performed during in-magnet exercise, RT-Cine images may exhibit significant motion artifacts. Methods: By projecting the time-averaged images to the subspace spanned by the coil sensitivity maps, we propose a coil reweighting (CR) method to automatically suppress a subset of receive coils that introduces a high level of artifacts in the reconstructed image. RT-Cine data collected at rest and during exercise from ten healthy volunteers and six patients were utilized to assess the performance of the proposed method. One short-axis and one two-chamber RT-Cine series reconstructed with and without CR from each subject were visually scored by two cardiologists in terms of the level of artifacts on a scale of 1 (worst) to 5 (best). Results: For healthy volunteers, applying CR to RT-Cine images collected at rest did not significantly change the image quality score (p=1). In contrast, for RT-Cine images collected during exercise, CR significantly improved the score from 3.9 to 4.68 (p<0.001). Similarly, in patients, CR did not significantly change the score for images collected at rest (p=0.031) but markedly improved the score from 3.15 to 4.42 (p<0.001) for images taken during exercise. Despite lower image quality scores in the patient cohort compared to healthy subjects, likely due to larger body habitus and the difficulty of limiting body motion during exercise, CR effectively suppressed motion artifacts, with all image series from the patient cohort receiving a score of four or higher. Conclusion: Using data from healthy subjects and patients, we demonstrate that motion artifacts in reconstructed RT-Cine images can be effectively suppressed with the proposed CR method.
Submitted: 26 May, 2024; originally announced May 2024.
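The reweighting criterion described in the abstract, projecting time-averaged coil data onto the subspace spanned by the sensitivity maps and down-weighting coils that fit it poorly, can be sketched as below. The per-coil scoring rule and threshold are illustrative assumptions, not the authors' implementation:

```python
import numpy as np

# Toy sketch: project the time-averaged data onto the subspace spanned by the
# coil sensitivity maps; coils associated with a large out-of-subspace residual
# are treated as artifact-prone and suppressed. Shapes and the threshold rule
# are assumptions for illustration.
rng = np.random.default_rng(2)
n_pix, n_coil = 256, 8
S = rng.normal(size=(n_pix, n_coil)) + 1j * rng.normal(size=(n_pix, n_coil))  # sensitivity maps
y = rng.normal(size=n_pix) + 1j * rng.normal(size=n_pix)                      # time-averaged image data

# Orthogonal projector onto range(S): P = S (S^H S)^{-1} S^H
P = S @ np.linalg.solve(S.conj().T @ S, S.conj().T)
residual = y - P @ y

# Per-coil consistency score from the residual (illustrative rule):
scores = np.abs(S.conj().T @ residual)
weights = np.where(scores > 2 * np.median(scores), 0.0, 1.0)  # suppress outlier coils
print(weights)
```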
arXiv:2405.00367 [pdf] (cs.IR, cs.AI, cs.SD, eess.AS) https://arxiv.org/abs/2405.00367
doi: 10.1145/3626772.3657976
Title: Distance Sampling-based Paraphraser Leveraging ChatGPT for Text Data Manipulation
Authors: Yoori Oh, Yoseob Han, Kyogu Lee
Abstract: There has been growing interest in audio-language retrieval research, where the objective is to establish the correlation between audio and text modalities. However, most audio-text paired datasets often lack rich expression of the text data compared to the audio samples. One of the significant challenges facing audio-text datasets is the presence of similar or identical captions despite different audio samples. Therefore, under many-to-one mapping conditions, audio-text datasets lead to poor performance of retrieval tasks. In this paper, we propose a novel approach to tackle the data imbalance problem in the audio-language retrieval task. To overcome this limitation, we introduce a method that employs a distance sampling-based paraphraser leveraging ChatGPT, utilizing a distance function to generate a controllable distribution of manipulated text data. For a set of sentences with the same context, the distance is used to calculate a degree of manipulation for any two sentences, and ChatGPT's few-shot prompting is performed using a text cluster with similar distances, defined via Jaccard similarity. Therefore, ChatGPT, when applied to few-shot prompting with text clusters, can adjust the diversity of the manipulated text based on the distance. The proposed approach is shown to significantly enhance performance in audio-text retrieval, outperforming conventional text augmentation techniques.
Submitted: 1 May, 2024; originally announced May 2024.
Comments: Accepted at SIGIR 2024 short paper track.
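The distance named in the abstract is derived from Jaccard similarity between captions. A minimal sketch of computing it and grouping captions by distance for few-shot prompting; the naive whitespace tokenization is an assumption:

```python
# Jaccard-based distance between two captions, the quantity the paraphraser
# uses to control how strongly a sentence is manipulated. Tokenization here
# is naive whitespace splitting, an assumption for illustration.
def jaccard_distance(a: str, b: str) -> float:
    sa, sb = set(a.lower().split()), set(b.lower().split())
    return 1.0 - len(sa & sb) / len(sa | sb)

caps = ["a dog barks in the yard",
        "a dog is barking outside",
        "rain falls on a tin roof"]

# Captions whose pairwise distance to the first caption is similar would be
# clustered and used as few-shot examples in the paraphrasing prompt.
dists = [jaccard_distance(caps[0], c) for c in caps[1:]]
print(dists)
```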
arXiv:2404.16318 [pdf, other] (eess.SY) https://arxiv.org/abs/2404.16318
Title: The Continuous-Time Weighted-Median Opinion Dynamics
Authors: Yi Han, Ge Chen, Florian Dörfler, Wenjun Mei
Abstract: Opinion dynamics models are important in understanding and predicting opinion formation processes within social groups. Although the weighted-averaging opinion-update mechanism is widely adopted as the micro-foundation of opinion dynamics, it bears a non-negligibly unrealistic implication: opinion attractiveness increases with opinion distance. Recently, the weighted-median mechanism has been proposed as a new microscopic mechanism of opinion exchange. Numerous advancements have been achieved regarding this new micro-foundation, from theoretical analysis to empirical validation, in a discrete-time asynchronous setup. However, the original discrete-time weighted-median model does not allow for "compromise behavior" in opinion exchanges, i.e., no intermediate opinions are created between disagreeing agents. To resolve this problem, this paper proposes a novel continuous-time weighted-median opinion dynamics model, in which agents' opinions move towards the weighted-medians of their out-neighbors' opinions. It turns out that the proof methods for the original discrete-time asynchronous model are no longer applicable to the analysis of the continuous-time model. In this paper, we first establish the existence and uniqueness of the solution to the continuous-time weighted-median opinion dynamics by showing that the weighted-median mapping is contractive on any graph. We also characterize the set of all the equilibria. Then, by leveraging a new LaSalle invariance principle argument, we prove the convergence of the continuous-time weighted-median model for any initial condition and derive a necessary and sufficient condition for convergence to consensus.
Submitted: 28 April, 2024; v1 submitted 24 April, 2024; originally announced April 2024.
Comments: 13 pages, 1 figure.
MSC Class: 91D30 (Primary), 93A16 (Secondary)
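The model's core update moves each opinion toward the weighted median of the neighbors' opinions. A forward-Euler sketch under an assumed random weight matrix (the paper treats general directed graphs and proves properties this toy does not):

```python
import numpy as np

def weighted_median(values, weights):
    """Smallest value at which the cumulative weight reaches half the total."""
    order = np.argsort(values)
    v, w = values[order], weights[order]
    cum = np.cumsum(w)
    return v[np.searchsorted(cum, 0.5 * w.sum())]

# Forward-Euler integration of dx_i/dt = med_i(x; w) - x_i, where med_i is the
# weighted median of agent i's out-neighbors' opinions. The graph, weights,
# and step size below are illustrative assumptions.
rng = np.random.default_rng(3)
n, dt, steps = 5, 0.05, 400
W = rng.uniform(size=(n, n))
np.fill_diagonal(W, 0)
W /= W.sum(axis=1, keepdims=True)
x = rng.uniform(-1, 1, size=n)
for _ in range(steps):
    med = np.array([weighted_median(x, W[i]) for i in range(n)])
    x = x + dt * (med - x)
print(x)  # final opinions; consensus iff all entries coincide
```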
arXiv:2403.08580 [pdf, other] (cs.CV, cs.MM, eess.IV) https://arxiv.org/abs/2403.08580
Title: Leveraging Compressed Frame Sizes For Ultra-Fast Video Classification
Authors: Yuxing Han, Yunan Ding, Chen Ye Gan, Jiangtao Wen
Abstract: Classifying videos into distinct categories, such as Sport and Music Video, is crucial for multimedia understanding and retrieval, especially when an immense volume of video content is being constantly generated. Traditional methods require video decompression to extract pixel-level features like color, texture, and motion, thereby increasing computational and storage demands. Moreover, these methods often suffer from performance degradation on low-quality videos. We present a novel approach that examines only the post-compression bitstream of a video to perform classification, eliminating the need for bitstream decoding. To validate our approach, we built a comprehensive dataset comprising over 29,000 YouTube video clips, totaling 6,000 hours and spanning 11 distinct categories. Our evaluations indicate precision, accuracy, and recall rates consistently above 80%, many exceeding 90%, and some reaching 99%. The algorithm operates approximately 15,000 times faster than real-time for 30fps videos, outperforming the traditional Dynamic Time Warping (DTW) algorithm by seven orders of magnitude.
Submitted: 13 March, 2024; originally announced March 2024.
Comments: 5 pages, 5 figures, 1 table. arXiv admin note: substantial text overlap with arXiv:2309.07361.
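The premise is that the frame-size sequence of a compressed bitstream alone separates categories: static scenes compress to flat sequences, cut-heavy content to spiky ones. A toy nearest-centroid illustration with invented summary features, not the paper's actual features or classifier:

```python
import numpy as np

def framesize_features(sizes):
    """Summary features of a compressed-video frame-size sequence (bytes)."""
    s = np.asarray(sizes, dtype=float)
    d = np.abs(np.diff(s))
    return np.array([s.mean(), s.std(), d.mean(), (s > 2 * s.mean()).mean()])

# Illustrative classification by nearest centroid over such features; the
# lognormal "videos" below are stand-ins for real bitstream frame sizes.
rng = np.random.default_rng(4)
music_video = rng.lognormal(9.0, 1.0, 300)   # frequent cuts -> spiky sizes
lecture = rng.lognormal(8.0, 0.2, 300)       # static scene -> flat sizes
centroids = {"music": framesize_features(music_video),
             "lecture": framesize_features(lecture)}

query = framesize_features(rng.lognormal(8.0, 0.25, 300))
label = min(centroids, key=lambda k: np.linalg.norm(query - centroids[k]))
print(label)
```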
arXiv:2403.06998 [pdf] (eess.SP, cs.HC, cs.NE) https://arxiv.org/abs/2403.06998
Title: High-speed Low-consumption sEMG-based Transient-state micro-Gesture Recognition
Authors: Youfang Han, Wei Zhao, Xiangjin Chen, Xin Meng
Abstract: Gesture recognition on wearable devices is extensively applied in human-computer interaction. Electromyography (EMG) has been used in many gesture recognition systems for its rapid perception of muscle signals. However, analyzing EMG signals on devices such as smart wristbands usually requires inference models with high performance, i.e., low inference latency, low power consumption, and low memory occupation. Therefore, this paper proposes an improved spiking neural network (SNN) to achieve these goals. We propose an adaptive multi-delta coding as a spiking coding method to improve recognition accuracy. We propose two additive solvers for the SNN, which significantly reduce inference energy consumption and parameter count while improving robustness to temporal differences. In addition, we propose a linear action detection method, TAD-LIF, which is suitable for SNNs. TAD-LIF is an improved LIF neuron that can detect transient-state gestures quickly and accurately. We collected two datasets from 20 subjects, covering 6 micro gestures, using two custom lightweight consumer-grade sEMG wristbands (3 and 8 electrode channels, respectively). Compared to CNN-, FCN-, and conventional SNN-based methods, the proposed SNN achieves higher recognition accuracy: 83.85% and 93.52% on the two datasets, respectively. In addition, the inference latency of the proposed SNN is about 1% of the CNN's, the power consumption about 0.1%, and the memory occupation about 20%. The proposed methods are therefore suited to precise, high-speed, and low-power micro-gesture recognition on consumer-level intelligent wearable devices, a step toward ubiquitous computing.
Submitted: 12 March, 2024; v1 submitted 4 March, 2024; originally announced March 2024.
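Delta-style spike coding plus a LIF readout, the two ingredients named above, can be sketched as follows. Thresholds and constants are invented; the paper's adaptive multi-delta coding and TAD-LIF neuron are more elaborate:

```python
import numpy as np

# Toy send-on-delta spike coding of one sEMG channel plus a basic leaky
# integrate-and-fire (LIF) readout. All constants are illustrative.
rng = np.random.default_rng(5)
semg = np.cumsum(rng.normal(0, 0.1, 500))  # surrogate sEMG trace

def delta_encode(x, threshold=0.3):
    """Emit +1/-1 spikes whenever the signal moves by `threshold` from the last event."""
    spikes, ref = np.zeros_like(x), x[0]
    for t, v in enumerate(x):
        if v - ref >= threshold:
            spikes[t], ref = 1.0, v
        elif ref - v >= threshold:
            spikes[t], ref = -1.0, v
    return spikes

def lif(spikes, leak=0.95, v_th=2.0):
    """LIF neuron driven by the spike train; returns output spike times."""
    v, out = 0.0, []
    for t, s in enumerate(spikes):
        v = leak * v + abs(s)
        if v >= v_th:
            out.append(t)
            v = 0.0
    return out

print(lif(delta_encode(semg))[:10])
```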
arXiv:2402.17877 [pdf, other] (eess.SP, eess.IV) https://arxiv.org/abs/2402.17877
Title: Accelerated Real-time Cine and Flow under In-magnet Staged Exercise
Authors: Preethi Chandrasekaran, Chong Chen, Yingmin Liu, Syed Murtaza Arshad, Christopher Crabtree, Matthew Tong, Yuchi Han, Rizwan Ahmad
Abstract: Background: Cardiovascular magnetic resonance imaging (CMR) is a well-established imaging tool for diagnosing and managing cardiac conditions. The integration of exercise stress with CMR (ExCMR) can enhance its diagnostic capacity. Despite recent advances in CMR technology, quantitative ExCMR during exercise remains technically challenging due to motion artifacts and limited spatial and temporal resolution. Methods: This study investigated the feasibility of biventricular functional and hemodynamic assessment using real-time (RT) ExCMR during a staged exercise protocol in 24 healthy volunteers. We applied a coil reweighting technique and employed high acceleration rates to minimize motion blurring and artifacts. We further applied a beat-selection technique that identified beats from the end-expiratory phase to minimize the impact of respiration-induced through-plane motion. Additionally, results from six patients are presented to demonstrate clinical feasibility. Results: Our findings indicate a consistent decrease in end-systolic volume and stable end-diastolic volume across exercise intensities, leading to increased stroke volume and ejection fraction. The selection of end-expiratory beats enhanced the repeatability of cardiac function parameters, as shown by scan-rescan tests in nine volunteers. High scores from a blinded image quality assessment indicate that coil reweighting effectively minimized motion artifacts. Conclusions: This study demonstrates the feasibility of RT ExCMR with in-magnet exercise in healthy subjects and patients. Our results indicate that high acceleration rates, coil reweighting, and selection of respiratory phase-specific heartbeats enhance the image quality and repeatability of quantitative RT ExCMR.
Submitted: 21 May, 2024; v1 submitted 27 February, 2024; originally announced February 2024.
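The end-expiratory beat selection can be illustrated simply: keep only heartbeats that land on the expiratory plateau of a respiratory waveform. All signals and the percentile rule below are assumptions for illustration, not the study's actual gating:

```python
import numpy as np

# Keep only heartbeats occurring near end-expiration, where respiration-induced
# through-plane motion is smallest. Respiratory signal, R-peak times, sign
# convention, and the percentile rule are all illustrative assumptions.
t = np.arange(0, 30, 0.01)                 # 30 s at 100 Hz
resp = np.sin(2 * np.pi * t / 4.0)         # ~4 s respiratory cycle (surrogate)
r_peaks = np.arange(0.5, 30, 0.9)          # heartbeats every 0.9 s (surrogate)

resp_at_beat = np.interp(r_peaks, t, resp)
plateau = resp_at_beat <= np.percentile(resp_at_beat, 30)  # assumed expiratory end
selected = r_peaks[plateau]
print(f"kept {selected.size} of {r_peaks.size} beats")
```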
arXiv:2401.08121 [pdf, other] (cs.LG, cs.AI, eess.SY) https://arxiv.org/abs/2401.08121
Title: CycLight: learning traffic signal cooperation with a cycle-level strategy
Authors: Gengyue Han, Xiaohan Liu, Xianyue Peng, Hao Wang, Yu Han
Abstract: This study introduces CycLight, a novel cycle-level deep reinforcement learning (RL) approach for network-level adaptive traffic signal control (NATSC) systems. Unlike most traditional RL-based traffic controllers that focus on step-by-step decision making, CycLight adopts a cycle-level strategy, optimizing cycle length and splits simultaneously using the Parameterized Deep Q-Network (PDQN) algorithm. This cycle-level approach effectively reduces the computational burden associated with frequent data communication while enhancing the practicality and safety of real-world applications. A decentralized framework is formulated for multi-agent cooperation, and an attention mechanism is integrated to accurately assess the impact of the surroundings on the current intersection. CycLight is tested in a large synthetic traffic grid using the microscopic traffic simulation tool SUMO. Experimental results not only demonstrate the superiority of CycLight over other state-of-the-art approaches but also showcase its robustness against information transmission delays.
Submitted: 16 January, 2024; originally announced January 2024.
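PDQN, the algorithm CycLight uses, couples a discrete choice with continuous arguments: a parameter head proposes continuous parameters for every discrete action, and a Q head scores each combination. A linear toy of that selection step only; the network shapes and traffic semantics are assumptions:

```python
import numpy as np

# Minimal PDQN-style action selection: for each discrete action (e.g., a
# signal phase structure), a parameter head proposes continuous arguments
# (e.g., cycle length and splits); a Q head scores (state, params) pairs.
rng = np.random.default_rng(6)
state_dim, n_actions, param_dim = 8, 3, 2
state = rng.normal(size=state_dim)

W_param = rng.normal(size=(n_actions, param_dim, state_dim))  # parameter "network"
W_q = rng.normal(size=(n_actions, state_dim + param_dim))     # Q "network"

params = np.tanh(W_param @ state)  # (n_actions, param_dim), bounded arguments
q_vals = np.array([W_q[a] @ np.concatenate([state, params[a]])
                   for a in range(n_actions)])
a_star = int(q_vals.argmax())
print(a_star, params[a_star])  # chosen phase structure and its continuous args
```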
arXiv:2312.17282 [pdf] (eess.SY, nlin.CD) https://arxiv.org/abs/2312.17282
Title: Nonlinear energy harvesting system with multiple stability
Authors: Yanwei Han, Zijian Zhang
Abstract: Nonlinear energy harvesting systems, i.e., forced-vibration systems with electromechanical coupling, are widely used to capture ambient vibration energy and convert mechanical energy into electrical energy. However, the nonlinear response mechanism of friction-induced-vibration (FIV) energy harvesting systems with multiple stability and stick-slip motion is still unclear. In this paper, a novel nonlinear energy harvesting model with multiple stability of single-, double-, and triple-well potentials is proposed based on a V-shaped spring structure and a belt-conveying system. The dynamic equations for the energy harvesting system with multiple stability and self-excited friction are established using the Euler-Lagrange equations. Second, the nonlinear restoring force, friction force, and potential energy surfaces characterizing the static behavior of the energy harvesting system are obtained, showing nonlinearly varying stiffness, multiple equilibrium points, discontinuous behaviors, and multiple-well responses. Then, the equilibrium surface of the bifurcation sets of the autonomous system is given, exhibiting third-order quasi-zero stiffness (QZS3), fifth-order quasi-zero stiffness (QZS5), double-well (DW), and triple-well (TW) regimes. Furthermore, the response amplitudes of charge, current, voltage, and power of the forced, electromechanically coupled vibration system in the QZS3, QZS5, DW, and TW regimes are analyzed numerically. Finally, a prototype of the FIV energy harvesting system is manufactured and an experimental system is set up. The experimental results for static restoring force, damping force, and electrical output agree well with the numerical results, validating the proposed FIV energy harvesting model.
Submitted: 27 December, 2023; originally announced December 2023.
Comments: 29 pages, 29 figures.
MSC Class: 34-xx. ACM Class: J.2
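For orientation, a generic lumped-parameter electromechanical harvester with a multi-well potential and belt friction takes the textbook form below. This is a standard coupling template, not the paper's specific V-shaped-spring model:

```latex
% Generic lumped electromechanical harvester: a mechanical oscillator with a
% nonlinear (multi-well) potential U(x) and belt friction f(\dot{x}), coupled
% to an electrical load R through a transduction coefficient \theta.
\begin{aligned}
m\ddot{x} + c\dot{x} + \frac{\mathrm{d}U}{\mathrm{d}x}(x) + f(\dot{x}) + \theta v &= F\cos(\omega t),\\
C_p\dot{v} + \frac{v}{R} - \theta\dot{x} &= 0.
\end{aligned}
```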
arXiv:2312.16383 [pdf, ps, other] (cs.SD, cs.AI, eess.AS) https://arxiv.org/abs/2312.16383
Title: Frame-level emotional state alignment method for speech emotion recognition
Authors: Qifei Li, Yingming Gao, Cong Wang, Yayue Deng, Jinlong Xue, Yichen Han, Ya Li
Abstract: Speech emotion recognition (SER) systems aim to recognize human emotional states during human-computer interaction. Most existing SER systems are trained on utterance-level labels. However, not all frames in an audio clip have affective states consistent with the utterance-level label, which makes it difficult for a model to distinguish the true emotion of the audio and leads to poor performance. To address this problem, we propose a frame-level emotional state alignment method for SER. First, we fine-tune a HuBERT model to obtain an SER system with the task-adaptive pretraining (TAPT) method, and extract embeddings from its transformer layers to form frame-level pseudo-emotion labels via clustering. Then, the pseudo labels are used to pretrain HuBERT, so that each frame output by HuBERT carries corresponding emotional information. Finally, we fine-tune the pretrained HuBERT for SER by adding an attention layer on top of it, which can focus on the frames that are emotionally most consistent with the utterance-level label. Experimental results on IEMOCAP indicate that our proposed method outperforms state-of-the-art (SOTA) methods.
Submitted: 26 December, 2023; originally announced December 2023.
Comments: Accepted by ICASSP 2024.
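The frame-level pseudo-labels come from clustering transformer-layer embeddings. A plain k-means sketch on surrogate frame embeddings; k, the layer choice, and the features are assumptions:

```python
import numpy as np

def kmeans(X, k, iters=50, seed=0):
    """Plain k-means; returns cluster assignments used as pseudo-labels."""
    rng = np.random.default_rng(seed)
    centers = X[rng.choice(len(X), k, replace=False)]
    for _ in range(iters):
        labels = np.linalg.norm(X[:, None] - centers[None], axis=-1).argmin(axis=1)
        for j in range(k):
            if np.any(labels == j):
                centers[j] = X[labels == j].mean(axis=0)
    return labels

# Frame embeddings from a transformer layer (random surrogates here) are
# clustered; the cluster ids then act as frame-level pseudo-emotion labels
# for pretraining.
frames = np.random.default_rng(7).normal(size=(1000, 32))
pseudo_labels = kmeans(frames, k=4)
print(np.bincount(pseudo_labels))
```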
arXiv:2312.10112 [pdf, other] (cs.CV, cs.LG, eess.IV) https://arxiv.org/abs/2312.10112
Title: NM-FlowGAN: Modeling sRGB Noise without Paired Images using a Hybrid Approach of Normalizing Flows and GAN
Authors: Young Joo Han, Ha-Jin Yu
Abstract: Modeling and synthesizing real sRGB noise is crucial for various low-level vision tasks, such as building datasets for training image denoising systems. The distribution of real sRGB noise is highly complex and affected by a multitude of factors, making its accurate modeling extremely challenging. Therefore, recent studies have proposed methods that employ data-driven generative models, such as Generative Adversarial Networks (GAN) and Normalizing Flows. These studies achieve more accurate modeling of sRGB noise than traditional noise modeling methods, but each generative model brings its own inherent performance limitations. To address this issue, we propose NM-FlowGAN, a hybrid approach that exploits the strengths of both GAN and Normalizing Flows. We combine pixel-wise noise modeling networks based on Normalizing Flows with spatial-correlation modeling networks based on GAN. Specifically, the pixel-wise noise modeling network leverages the high training stability of Normalizing Flows to capture noise characteristics that are affected by a multitude of factors, and the spatial-correlation networks efficiently model pixel-to-pixel relationships. In particular, unlike recent methods that rely on paired noisy images, our method synthesizes noise from clean images and factors that affect noise characteristics, such as easily obtainable parameters like camera type and ISO settings, making it applicable to fields where obtaining noisy-clean image pairs is not feasible. In our experiments, NM-FlowGAN outperforms other baselines in the sRGB noise synthesis task. Moreover, a denoising neural network trained with synthesized image pairs from our model shows superior performance compared to other baselines. Our code is available at https://github.com/YoungJooHan/NM-FlowGAN.
Submitted: 31 October, 2024; v1 submitted 15 December, 2023; originally announced December 2023.
Comments: 13 pages, 10 figures, 8 tables.
MSC Class: 68T45. ACM Class: I.4.4
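The division of labor, a flow for pixel-wise statistics conditioned on metadata such as ISO and a GAN for spatial correlation, can be caricatured in a few lines. Every parameter below is invented for illustration; this is not NM-FlowGAN:

```python
import numpy as np

# Caricature of the hybrid split: (1) a conditional affine "flow" turns
# Gaussian latents into pixel-wise noise whose scale depends on ISO;
# (2) a small fixed convolution (stand-in for the GAN generator) injects
# pixel-to-pixel correlation. All parameters are invented.
rng = np.random.default_rng(8)

def sample_noise(h, w, iso):
    sigma = 0.002 * np.sqrt(iso)           # assumed ISO-dependent scale
    pixelwise = sigma * rng.normal(size=(h, w))   # flow part: pixel-wise statistics
    k = np.array([[0.05, 0.1, 0.05],
                  [0.1,  0.4, 0.1],
                  [0.05, 0.1, 0.05]])      # GAN stand-in: local spatial correlation
    padded = np.pad(pixelwise, 1, mode="reflect")
    return sum(k[i, j] * padded[i:i + h, j:j + w]
               for i in range(3) for j in range(3))

print(sample_noise(4, 4, iso=3200).round(4))
```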
arXiv:2310.11044 [pdf, ps, other] (cs.IT, eess.SP) https://arxiv.org/abs/2310.11044
Title: A Tutorial on Near-Field XL-MIMO Communications Towards 6G
Authors: Haiquan Lu, Yong Zeng, Changsheng You, Yu Han, Jiayi Zhang, Zhe Wang, Zhenjun Dong, Shi Jin, Cheng-Xiang Wang, Tao Jiang, Xiaohu You, Rui Zhang
Abstract: Extremely large-scale multiple-input multiple-output (XL-MIMO) is a promising technology for the sixth-generation (6G) mobile communication networks. By significantly boosting the antenna number or size to at least an order of magnitude beyond current massive MIMO systems, XL-MIMO is expected to unprecedentedly enhance the spectral efficiency and spatial resolution of wireless communication. The evolution from massive MIMO to XL-MIMO is not simply an increase in array size; it raises new design challenges in near-field channel modelling, performance analysis, channel estimation, and practical implementation. In this article, we give a comprehensive tutorial overview of near-field XL-MIMO communications, aiming to provide useful guidance for tackling the above challenges. First, the basic near-field modelling for XL-MIMO is established, considering the new characteristics of non-uniform spherical waves (NUSW) and spatial non-stationarity. Next, based on the near-field modelling, the performance analysis of XL-MIMO is presented, including the near-field signal-to-noise ratio (SNR) scaling laws, beam focusing pattern, achievable rate, and degrees of freedom (DoF). Furthermore, various XL-MIMO design issues, such as the near-field beam codebook, beam training, channel estimation, and delay alignment modulation (DAM) transmission, are elaborated. Finally, we point out promising directions to inspire future research on near-field XL-MIMO communications.
Submitted: 3 April, 2024; v1 submitted 17 October, 2023; originally announced October 2023.
Comments: 42 pages.
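The defining near-field effect is that the spherical-wave array response stops matching the far-field plane-wave model once a source sits well inside the Rayleigh distance. A NumPy check under an assumed geometry chosen to make the mismatch visible:

```python
import numpy as np

# Near-field (spherical-wave) array response of a uniform linear array vs.
# the far-field planar approximation. Carrier, array size, and source
# location are assumptions chosen to sit well inside the Rayleigh distance.
c, f = 3e8, 30e9                       # 30 GHz carrier
lam = c / f
n, d = 512, lam / 2                    # XL array: 512 half-wavelength elements
y = (np.arange(n) - (n - 1) / 2) * d   # element positions on the y-axis

src = np.array([5.0, 2.0])             # source ~5.4 m away
dist = np.sqrt(src[0] ** 2 + (src[1] - y) ** 2)
a_near = np.exp(-1j * 2 * np.pi * dist / lam) / np.sqrt(n)

theta = np.arctan2(src[1], src[0])     # far-field model: plane wave from angle theta
a_far = np.exp(1j * 2 * np.pi * y * np.sin(theta) / lam) / np.sqrt(n)

rayleigh = 2 * (n * d) ** 2 / lam
print(f"Rayleigh distance {rayleigh:.0f} m, "
      f"|<a_near, a_far>| = {abs(a_near.conj() @ a_far):.3f}")  # << 1 in the near field
```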
href="/search/eess?searchtype=author&amp;query=Tan%2C+J">Jun Tan</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+Y">Yongbing Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2310.07464v1-abstract-short" style="display: inline;"> Biomarker detection is an indispensable part in the diagnosis and treatment of low-grade glioma (LGG). However, current LGG biomarker detection methods rely on expensive and complex molecular genetic testing, for which professionals are required to analyze the results, and intra-rater variability is often reported. To overcome these challenges, we propose an interpretable deep learning pipeline, a&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.07464v1-abstract-full').style.display = 'inline'; document.getElementById('2310.07464v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2310.07464v1-abstract-full" style="display: none;"> Biomarker detection is an indispensable part in the diagnosis and treatment of low-grade glioma (LGG). However, current LGG biomarker detection methods rely on expensive and complex molecular genetic testing, for which professionals are required to analyze the results, and intra-rater variability is often reported. To overcome these challenges, we propose an interpretable deep learning pipeline, a Multi-Biomarker Histomorphology Discoverer (Multi-Beholder) model based on the multiple instance learning (MIL) framework, to predict the status of five biomarkers in LGG using only hematoxylin and eosin-stained whole slide images and slide-level biomarker status labels. Specifically, by incorporating the one-class classification into the MIL framework, accurate instance pseudo-labeling is realized for instance-level supervision, which greatly complements the slide-level labels and improves the biomarker prediction performance. Multi-Beholder demonstrates superior prediction performance and generalizability for five LGG biomarkers (AUROC=0.6469-0.9735) in two cohorts (n=607) with diverse races and scanning protocols. Moreover, the excellent interpretability of Multi-Beholder allows for discovering the quantitative and qualitative correlations between biomarker status and histomorphology characteristics. Our pipeline not only provides a novel approach for biomarker prediction, enhancing the applicability of molecular treatments for LGG patients but also facilitates the discovery of new mechanisms in molecular functionality and LGG progression. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.07464v1-abstract-full').style.display = 'none'; document.getElementById('2310.07464v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2023. 
arXiv:2309.16128 [pdf, other] (cs.CV, eess.IV) https://arxiv.org/abs/2309.16128
Title: Joint Correcting and Refinement for Balanced Low-Light Image Enhancement
Authors: Nana Yu, Hong Shi, Yahong Han
Abstract: Low-light image enhancement demands an appropriate balance among brightness, color, and illumination. Existing methods, however, often focus on one aspect of the image without attending to this balance, which causes problems such as color distortion and overexposure that seriously affect both human visual perception and the performance of high-level vision models. In this work, a novel synergistic structure is proposed that balances brightness, color, and illumination more effectively. The proposed Joint Correcting and Refinement Network (JCRNet) mainly consists of three stages. Stage 1: a basic encoder-decoder and a local supervision mechanism extract local information and more comprehensive details for enhancement. Stage 2: cross-stage feature transmission and spatial feature transformation further facilitate color correction and feature refinement. Stage 3: a dynamic illumination adjustment approach embeds residuals between predicted and ground-truth images into the model, adaptively adjusting the illumination balance. Extensive experiments demonstrate that the proposed method exhibits comprehensive performance advantages over 21 state-of-the-art methods on 9 benchmark datasets. Furthermore, an additional experiment validates the effectiveness of our approach in downstream vision tasks (e.g., saliency detection), where the proposed method effectively improves the segmentation results and quantitative metrics compared with several enhancement models. The source code will be available at https://github.com/woshiyll/JCRNet.
Submitted: 19 October, 2023; v1 submitted 27 September, 2023; originally announced September 2023.
However, they are limited by th&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.11977v3-abstract-full').style.display = 'inline'; document.getElementById('2309.11977v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2309.11977v3-abstract-full" style="display: none;"> Zero-shot text-to-speech (TTS) synthesis aims to clone any unseen speaker&#39;s voice without adaptation parameters. By quantizing speech waveforms into discrete acoustic tokens and modeling these tokens with the language model, recent language model-based TTS models show zero-shot speaker adaptation capabilities with only a 3-second acoustic prompt of an unseen speaker. However, they are limited by the length of the acoustic prompt, which makes it difficult to clone the personal speaking style. In this paper, we propose a novel zero-shot TTS model with multi-scale acoustic prompts, based on the neural codec language model VALL-E. A speaker-aware text encoder is proposed to learn the personal speaking style at the phoneme level from the style prompt consisting of multiple sentences. Following that, a VALL-E based acoustic decoder is utilized to model the timbre from the timbre prompt at the frame level and generate speech. The experimental results show that our proposed method outperforms baselines in terms of naturalness and speaker similarity, and can achieve better performance by scaling out to a longer style prompt. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.11977v3-abstract-full').style.display = 'none'; document.getElementById('2309.11977v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 21 September, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2023.
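</p> <p class="is-size-7">A minimal sketch of the phoneme-level style conditioning described in the abstract above, assuming a cross-attention design; the module names, dimensions, and fusion scheme are illustrative assumptions, not the authors' implementation:</p> <pre><code class="language-python">
# Hypothetical sketch: phoneme features attend to a multi-sentence style
# prompt so each phoneme carries speaking-style information (frame-level
# timbre would be handled separately by a VALL-E style decoder).
import torch
import torch.nn as nn

class SpeakerAwareTextEncoder(nn.Module):
    def __init__(self, vocab=128, d=256, heads=4):
        super().__init__()
        self.phone_emb = nn.Embedding(vocab, d)
        self.text_enc = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d, heads, batch_first=True), num_layers=2)
        # phoneme queries attend to style-prompt frames (cross-attention)
        self.style_attn = nn.MultiheadAttention(d, heads, batch_first=True)

    def forward(self, phonemes, style_frames):
        # phonemes: (B, T_text) token ids; style_frames: (B, T_style, d)
        h = self.text_enc(self.phone_emb(phonemes))
        style, _ = self.style_attn(h, style_frames, style_frames)
        return h + style  # phoneme-level features carrying speaking style

enc = SpeakerAwareTextEncoder()
out = enc(torch.randint(0, 128, (2, 13)), torch.randn(2, 200, 256))
print(out.shape)  # torch.Size([2, 13, 256])
</code></pre> <p class="is-size-7">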
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted bt ICASSP 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2309.03686">arXiv:2309.03686</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2309.03686">pdf</a>, <a href="https://arxiv.org/format/2309.03686">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> MS-UNet-v2: Adaptive Denoising Method and Training Strategy for Medical Image Segmentation with Small Training Data </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Chen%2C+H">Haoyuan Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Han%2C+Y">Yufei Han</a>, <a href="/search/eess?searchtype=author&amp;query=Xu%2C+P">Pin Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+Y">Yanyi Li</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+K">Kuan Li</a>, <a href="/search/eess?searchtype=author&amp;query=Yin%2C+J">Jianping Yin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2309.03686v1-abstract-short" style="display: inline;"> Models based on U-like structures have improved the performance of medical image segmentation. However, the single-layer decoder structure of U-Net is too &#34;thin&#34; to exploit enough information, resulting in large semantic differences between the encoder and decoder parts. Things get worse if the number of training sets of data is not sufficiently large, which is common in medical image processing t&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.03686v1-abstract-full').style.display = 'inline'; document.getElementById('2309.03686v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2309.03686v1-abstract-full" style="display: none;"> Models based on U-like structures have improved the performance of medical image segmentation. However, the single-layer decoder structure of U-Net is too &#34;thin&#34; to exploit enough information, resulting in large semantic differences between the encoder and decoder parts. Things get worse if the number of training sets of data is not sufficiently large, which is common in medical image processing tasks where annotated data are more difficult to obtain than other tasks. Based on this observation, we propose a novel U-Net model named MS-UNet for the medical image segmentation task in this study. Instead of the single-layer U-Net decoder structure used in Swin-UNet and TransUnet, we specifically design a multi-scale nested decoder based on the Swin Transformer for U-Net. The proposed multi-scale nested decoder structure allows the feature mapping between the decoder and encoder to be semantically closer, thus enabling the network to learn more detailed features. 
In addition, we propose a novel edge loss and a plug-and-play fine-tuning Denoising module, which not only effectively improve the segmentation performance of MS-UNet but can also be applied to other models individually. Experimental results show that MS-UNet effectively improves network performance through more efficient feature learning and exhibits superior performance, especially in the extreme case with a small amount of training data, and that the proposed Edge loss and Denoising module significantly enhance the segmentation performance of MS-UNet. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.03686v1-abstract-full').style.display = 'none'; document.getElementById('2309.03686v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 September, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2309.03451">arXiv:2309.03451</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2309.03451">pdf</a>, <a href="https://arxiv.org/format/2309.03451">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Cross-domain Sound Recognition for Efficient Underwater Data Analysis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Park%2C+J">Jeongsoo Park</a>, <a href="/search/eess?searchtype=author&amp;query=Han%2C+D">Dong-Gyun Han</a>, <a href="/search/eess?searchtype=author&amp;query=La%2C+H+S">Hyoung Sul La</a>, <a href="/search/eess?searchtype=author&amp;query=Lee%2C+S">Sangmin Lee</a>, <a href="/search/eess?searchtype=author&amp;query=Han%2C+Y">Yoonchang Han</a>, <a href="/search/eess?searchtype=author&amp;query=Yang%2C+E">Eun-Jin Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2309.03451v2-abstract-short" style="display: inline;"> This paper presents a novel deep learning approach for analyzing massive underwater acoustic data by leveraging a model trained on a broad spectrum of non-underwater (aerial) sounds. Recognizing the challenge in labeling vast amounts of underwater data, we propose a two-fold methodology to accelerate this labor-intensive procedure. The first part of our approach involves PCA and UMAP visualizati&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.03451v2-abstract-full').style.display = 'inline'; document.getElementById('2309.03451v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2309.03451v2-abstract-full" style="display: none;"> This paper presents a novel deep learning approach for analyzing massive underwater acoustic data by leveraging a model trained on a broad spectrum of non-underwater (aerial) sounds.
Recognizing the challenge in labeling vast amounts of underwater data, we propose a two-fold methodology to accelerate this labor-intensive procedure. The first part of our approach involves PCA and UMAP visualization of the underwater data using the feature vectors of an aerial sound recognition model. This enables us to cluster the data in a two-dimensional space and listen to points within these clusters to understand their defining characteristics. This innovative method simplifies the process of selecting candidate labels for further training. In the second part, we train a neural network model using both the selected underwater data and the non-underwater dataset. We conducted a quantitative analysis to measure the precision, recall, and F1 score of our model for recognizing airgun sounds, a common type of underwater sound. The F1 score achieved by our model exceeded 84.3%, demonstrating the effectiveness of our approach in analyzing underwater acoustic data. The methodology presented in this paper holds significant potential to reduce the amount of labor required in underwater data analysis and opens up new possibilities for further research in the field of cross-domain data analysis. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.03451v2-abstract-full').style.display = 'none'; document.getElementById('2309.03451v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 6 September, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to APSIPA 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2308.15752">arXiv:2308.15752</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2308.15752">pdf</a>, <a href="https://arxiv.org/format/2308.15752">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Large-scale data extraction from the UNOS organ donor documents </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Rychlik%2C+M">Marek Rychlik</a>, <a href="/search/eess?searchtype=author&amp;query=Tanriover%2C+B">Bekir Tanriover</a>, <a href="/search/eess?searchtype=author&amp;query=Han%2C+Y">Yan Han</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2308.15752v3-abstract-short" style="display: inline;"> In this paper we focus on three major tasks: 1) discussing our methods: Our method captures a portion of the data in DCD flowsheets, kidney perfusion data, and Flowsheet data captured peri-organ recovery surgery. 2) demonstrating the result: We built a comprehensive, analyzable database from 2022 OPTN data.
This dataset is by far larger than any previously available, even in this preliminary phase;&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.15752v3-abstract-full').style.display = 'inline'; document.getElementById('2308.15752v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2308.15752v3-abstract-full" style="display: none;"> In this paper we focus on three major tasks: 1) discussing our methods: Our method captures a portion of the data in DCD flowsheets, kidney perfusion data, and Flowsheet data captured peri-organ recovery surgery. 2) demonstrating the result: We built a comprehensive, analyzable database from 2022 OPTN data. This dataset is by far larger than any previously available, even in this preliminary phase; and 3) proving that our methods can be extended to all the past OPTN data and future data. The scope of our study is all Organ Procurement and Transplantation Network (OPTN) data of the USA organ donors since 2008. The data was not analyzable on a large scale in the past because it was captured in PDF documents known as ``Attachments&#39;&#39;, whereby every donor&#39;s information was recorded into dozens of PDF documents in heterogeneous formats. To make the data analyzable, one needs to convert the content inside these PDFs to an analyzable data format, such as a standard SQL database. In this paper we will focus on 2022 OPTN data, which consists of $\approx 400,000$ PDF documents spanning millions of pages. The entire OPTN data covers 15 years (2008--2022). This paper assumes that readers are familiar with the content of the OPTN data. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.15752v3-abstract-full').style.display = 'none'; document.getElementById('2308.15752v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 January, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 30 August, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2023.
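</p> <p class="is-size-7">The conversion idea above, turning heterogeneous attachment PDFs into one queryable SQL database, can be sketched as follows; this is not the authors' pipeline, and the file path and table schema are hypothetical:</p> <pre><code class="language-python">
# Hypothetical sketch: extract text from donor "Attachment" PDFs and load it
# into SQLite so the content becomes queryable at scale.
import sqlite3
import pdfplumber  # pip install pdfplumber

con = sqlite3.connect("optn_2022.db")
con.execute("CREATE TABLE IF NOT EXISTS pages (doc TEXT, page INTEGER, text TEXT)")

def ingest(pdf_path):
    with pdfplumber.open(pdf_path) as pdf:
        for i, page in enumerate(pdf.pages):
            # extract_text() covers simple layouts; the heterogeneous
            # flowsheet formats would need per-template parsers in practice
            con.execute("INSERT INTO pages VALUES (?, ?, ?)",
                        (pdf_path, i, page.extract_text() or ""))
    con.commit()

ingest("attachments/donor_0001_dcd_flowsheet.pdf")  # hypothetical file
</code></pre> <p class="is-size-7">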
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 62; 68 <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.5.4 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2308.12985">arXiv:2308.12985</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2308.12985">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Perimeter Control with Heterogeneous Metering Rates for Cordon Signals: A Physics-Regularized Multi-Agent Reinforcement Learning Approach </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Yu%2C+J">Jiajie Yu</a>, <a href="/search/eess?searchtype=author&amp;query=Laharotte%2C+P">Pierre-Antoine Laharotte</a>, <a href="/search/eess?searchtype=author&amp;query=Han%2C+Y">Yu Han</a>, <a href="/search/eess?searchtype=author&amp;query=Ma%2C+W">Wei Ma</a>, <a href="/search/eess?searchtype=author&amp;query=Leclercq%2C+L">Ludovic Leclercq</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2308.12985v2-abstract-short" style="display: inline;"> Perimeter Control (PC) strategies have been proposed to address urban road network control in oversaturated situations by regulating the transfer flow of the Protected Network (PN) based on the Macroscopic Fundamental Diagram (MFD). The uniform metering rate for cordon signals in most existing studies overlooks the variance of local traffic states at the intersection level, which may cause severe&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.12985v2-abstract-full').style.display = 'inline'; document.getElementById('2308.12985v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2308.12985v2-abstract-full" style="display: none;"> Perimeter Control (PC) strategies have been proposed to address urban road network control in oversaturated situations by regulating the transfer flow of the Protected Network (PN) based on the Macroscopic Fundamental Diagram (MFD). The uniform metering rate for cordon signals in most existing studies overlooks the variance of local traffic states at the intersection level, which may cause severe local traffic congestion and degradation of the network stability. PC strategies with heterogeneous metering rates for cordon signals allow precise control for the perimeter but the complexity of the problem increases exponentially with the scale of the PN. This paper leverages a Multi-Agent Reinforcement Learning (MARL)-based traffic signal control framework to decompose this PC problem, which considers heterogeneous metering rates for cordon signals, into multi-agent cooperation tasks. Each agent controls an individual signal located in the cordon, decreasing the dimension of action space for the controller compared to centralized methods. 
A physics regularization approach for the MARL framework is proposed to ensure the distributed cordon signal controllers are aware of the global network state by encoding MFD-based knowledge into the action-value functions of the local agents. The proposed PC strategy operates as a two-stage system, with a feedback PC strategy detecting the overall traffic state within the PN and then distributing local instructions to cordon signal controllers in the MARL framework via the physics regularization. Through numerical tests with different demand patterns in a microscopic traffic environment, the proposed PC strategy shows promising robustness and transferability. It outperforms state-of-the-art feedback PC strategies in increasing network throughput, decreasing distributed delay for gate links, and reducing carbon emissions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.12985v2-abstract-full').style.display = 'none'; document.getElementById('2308.12985v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 24 August, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">21 pages, 24 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2308.02088">arXiv:2308.02088</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2308.02088">pdf</a>, <a href="https://arxiv.org/format/2308.02088">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1002/mrm.30123">10.1002/mrm.30123 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Motion-robust free-running volumetric cardiovascular MRI </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Arshad%2C+S+M">Syed M. Arshad</a>, <a href="/search/eess?searchtype=author&amp;query=Potter%2C+L+C">Lee C. Potter</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+C">Chong Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+Y">Yingmin Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Chandrasekaran%2C+P">Preethi Chandrasekaran</a>, <a href="/search/eess?searchtype=author&amp;query=Crabtree%2C+C">Christopher Crabtree</a>, <a href="/search/eess?searchtype=author&amp;query=Tong%2C+M+S">Matthew S. Tong</a>, <a href="/search/eess?searchtype=author&amp;query=Simonetti%2C+O+P">Orlando P.
Simonetti</a>, <a href="/search/eess?searchtype=author&amp;query=Han%2C+Y">Yuchi Han</a>, <a href="/search/eess?searchtype=author&amp;query=Ahmad%2C+R">Rizwan Ahmad</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2308.02088v3-abstract-short" style="display: inline;"> PURPOSE: To present and assess an outlier mitigation method that makes free-running volumetric cardiovascular MRI (CMR) more robust to motion. METHODS: The proposed method, called compressive recovery with outlier rejection (CORe), models outliers in the measured data as an additive auxiliary variable. We enforce MR physics-guided group sparsity on the auxiliary variable, and jointly estimate it&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.02088v3-abstract-full').style.display = 'inline'; document.getElementById('2308.02088v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2308.02088v3-abstract-full" style="display: none;"> PURPOSE: To present and assess an outlier mitigation method that makes free-running volumetric cardiovascular MRI (CMR) more robust to motion. METHODS: The proposed method, called compressive recovery with outlier rejection (CORe), models outliers in the measured data as an additive auxiliary variable. We enforce MR physics-guided group sparsity on the auxiliary variable, and jointly estimate it along with the image using an iterative algorithm. For evaluation, CORe is first compared to traditional compressed sensing (CS), robust regression (RR), and an existing outlier rejection method using two simulation studies. Then, CORe is compared to CS using seven three-dimensional (3D) cine, 12 rest four-dimensional (4D) flow, and eight stress 4D flow imaging datasets. RESULTS: Our simulation studies show that CORe outperforms CS, RR, and the existing outlier rejection method in terms of normalized mean square error and structural similarity index across 55 different realizations. The expert reader evaluation of 3D cine images demonstrates that CORe is more effective in suppressing artifacts while maintaining or improving image sharpness. Finally, 4D flow images show that CORe yields more reliable and consistent flow measurements, especially in the presence of involuntary subject motion or exercise stress. CONCLUSION: An outlier rejection method is presented and tested using simulated and measured data. This method can help suppress motion artifacts in a wide range of free-running CMR applications. CODE &amp; DATA: Implementation code and datasets are available on GitHub at http://github.com/OSU-MR/motion-robust-CMR <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.02088v3-abstract-full').style.display = 'none'; document.getElementById('2308.02088v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 3 August, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2023. 
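</p> <p class="is-size-7">A toy numerical sketch of the stated model, not the released CORe code: measurements are modeled as y = Ax + v + noise with a group-sparse outlier variable v, and x and v are estimated jointly by alternating proximal steps. The penalties, group size, and step size here are illustrative assumptions:</p> <pre><code class="language-python">
import numpy as np

def soft(z, t):  # elementwise soft-threshold (l1 prox)
    return np.sign(z) * np.maximum(np.abs(z) - t, 0.0)

def group_soft(z, t, gsize):
    # zero out whole groups (e.g., readouts hit by motion) whose energy
    # falls below the threshold t; assumes len(z) is a multiple of gsize
    g = z.reshape(-1, gsize)
    norms = np.linalg.norm(g, axis=1, keepdims=True)
    scale = np.maximum(1.0 - t / np.maximum(norms, 1e-12), 0.0)
    return (g * scale).ravel()

def core_sketch(A, y, lam=0.05, mu=0.5, gsize=8, iters=200):
    x, v = np.zeros(A.shape[1]), np.zeros(A.shape[0])
    step = 1.0 / np.linalg.norm(A, 2) ** 2
    for _ in range(iters):
        r = A @ x + v - y
        x = soft(x - step * (A.T @ r), step * lam)  # sparse image update
        v = group_soft(y - A @ x, mu, gsize)        # outlier update (closed form)
    return x, v
</code></pre> <p class="is-size-7">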
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Magnetic Resonance in Medicine 92(3) (2024) 1248-1262 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2307.13237">arXiv:2307.13237</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2307.13237">pdf</a>, <a href="https://arxiv.org/ps/2307.13237">ps</a>, <a href="https://arxiv.org/format/2307.13237">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/LWC.2023.3331489">10.1109/LWC.2023.3331489 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Rank Optimization for MIMO Channel with RIS: Simulation and Measurement </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Meng%2C+S">Shengguo Meng</a>, <a href="/search/eess?searchtype=author&amp;query=Tang%2C+W">Wankai Tang</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+W">Weicong Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Lan%2C+J">Jifeng Lan</a>, <a href="/search/eess?searchtype=author&amp;query=Zhou%2C+Q+Y">Qun Yan Zhou</a>, <a href="/search/eess?searchtype=author&amp;query=Han%2C+Y">Yu Han</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+X">Xiao Li</a>, <a href="/search/eess?searchtype=author&amp;query=Jin%2C+S">Shi Jin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2307.13237v2-abstract-short" style="display: inline;"> Reconfigurable intelligent surface (RIS) is a promising technology that can reshape the electromagnetic environment in wireless networks, offering various possibilities for enhancing wireless channels. Motivated by this, we investigate the channel optimization for multiple-input multiple-output (MIMO) systems assisted by RIS. In this paper, an efficient RIS optimization method is proposed to enhan&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2307.13237v2-abstract-full').style.display = 'inline'; document.getElementById('2307.13237v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2307.13237v2-abstract-full" style="display: none;"> Reconfigurable intelligent surface (RIS) is a promising technology that can reshape the electromagnetic environment in wireless networks, offering various possibilities for enhancing wireless channels. Motivated by this, we investigate the channel optimization for multiple-input multiple-output (MIMO) systems assisted by RIS. In this paper, an efficient RIS optimization method is proposed to enhance the effective rank of the MIMO channel for achievable rate improvement. Numerical results are presented to verify the effectiveness of RIS in improving MIMO channels. 
Additionally, we construct a 2$\times$2 RIS-assisted MIMO prototype to perform experimental measurements and validate the performance of our proposed algorithm. The results reveal a significant increase in effective rank and achievable rate for the RIS-assisted MIMO channel compared to the MIMO channel without RIS. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2307.13237v2-abstract-full').style.display = 'none'; document.getElementById('2307.13237v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 25 July, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This work has been accepted by IEEE WCL</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2307.09823">arXiv:2307.09823</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2307.09823">pdf</a>, <a href="https://arxiv.org/format/2307.09823">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Multi-modal Learning based Prediction for Disease </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Chen%2C+Y">Yaran Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+X">Xueyu Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Han%2C+Y">Yu Han</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+H">Haoran Li</a>, <a href="/search/eess?searchtype=author&amp;query=Zhao%2C+D">Dongbin Zhao</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+J">Jingzhong Li</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+X">Xu Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2307.09823v1-abstract-short" style="display: inline;"> Non-alcoholic fatty liver disease (NAFLD) is the most common cause of chronic liver disease, and accurate prediction can help prevent advanced fibrosis and cirrhosis. However, a liver biopsy, the gold standard for NAFLD diagnosis, is invasive, expensive, and prone to sampling errors.
Therefore, non-invasive studies are extremely promising, yet they are still in their infancy due to the lack of c&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2307.09823v1-abstract-full').style.display = 'inline'; document.getElementById('2307.09823v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2307.09823v1-abstract-full" style="display: none;"> Non-alcoholic fatty liver disease (NAFLD) is the most common cause of chronic liver disease, and accurate prediction can help prevent advanced fibrosis and cirrhosis. However, a liver biopsy, the gold standard for NAFLD diagnosis, is invasive, expensive, and prone to sampling errors. Therefore, non-invasive studies are extremely promising, yet they are still in their infancy due to the lack of comprehensive research data and intelligent methods for multi-modal data. This paper proposes a NAFLD diagnosis system (DeepFLDDiag) combining a comprehensive clinical dataset (FLDData) and a multi-modal learning based NAFLD prediction method (DeepFLD). The dataset includes over 6000 participants&#39; physical examinations, laboratory and imaging studies, extensive questionnaires, and facial images of some participants, which is comprehensive and valuable for clinical studies. From the dataset, we quantitatively analyze and select the clinical metadata that contribute most to NAFLD prediction. Furthermore, the proposed DeepFLD, a deep neural network model designed to predict NAFLD using multi-modal input, including metadata and facial images, outperforms the approach that only uses metadata. Satisfactory performance is also verified on other unseen datasets. Encouragingly, DeepFLD can achieve competitive results using only facial images as input rather than metadata, paving the way for a more robust and simpler non-invasive NAFLD diagnosis. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2307.09823v1-abstract-full').style.display = 'none'; document.getElementById('2307.09823v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 July, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2306.07650">arXiv:2306.07650</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2306.07650">pdf</a>, <a href="https://arxiv.org/format/2306.07650">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Modality Adaption or Regularization?
A Case Study on End-to-End Speech Translation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Han%2C+Y">Yuchen Han</a>, <a href="/search/eess?searchtype=author&amp;query=Xu%2C+C">Chen Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Xiao%2C+T">Tong Xiao</a>, <a href="/search/eess?searchtype=author&amp;query=Zhu%2C+J">Jingbo Zhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2306.07650v1-abstract-short" style="display: inline;"> Pre-training and fine-tuning is a paradigm for alleviating the data scarcity problem in end-to-end speech translation (E2E ST). The commonplace &#34;modality gap&#34; between speech and text data often leads to inconsistent inputs between pre-training and fine-tuning. However, we observe that this gap occurs in the early stages of fine-tuning, but does not have a major impact on the final performance. On&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.07650v1-abstract-full').style.display = 'inline'; document.getElementById('2306.07650v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2306.07650v1-abstract-full" style="display: none;"> Pre-training and fine-tuning is a paradigm for alleviating the data scarcity problem in end-to-end speech translation (E2E ST). The commonplace &#34;modality gap&#34; between speech and text data often leads to inconsistent inputs between pre-training and fine-tuning. However, we observe that this gap occurs in the early stages of fine-tuning, but does not have a major impact on the final performance. On the other hand, we find that there is another gap, which we call the &#34;capacity gap&#34;: high-resource tasks (such as ASR and MT) require a large model to fit; when the model is reused for a low-resource task (E2E ST), it yields sub-optimal performance due to over-fitting. In a case study, we find that regularization plays a more important role than the well-designed modality adaption method, which achieves 29.0 for en-de and 40.3 for en-fr on the MuST-C dataset. Code and models are available at https://github.com/hannlp/TAB. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.07650v1-abstract-full').style.display = 'none'; document.getElementById('2306.07650v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2023.
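</p> <p class="is-size-7">As one concrete example of the kind of fine-tuning regularization the abstract credits (not necessarily the scheme used in the paper), an L2-SP-style penalty discourages the reused pre-trained weights from drifting and over-fitting the small ST data:</p> <pre><code class="language-python">
# Hypothetical illustration: penalize squared distance from the pre-trained
# weights during fine-tuning; "pretrained_state" is a saved state_dict.
import torch

def l2_sp_penalty(model, pretrained_state, weight=0.01):
    penalty = torch.zeros(())
    for name, p in model.named_parameters():
        if name in pretrained_state:
            penalty = penalty + (p - pretrained_state[name].detach()).pow(2).sum()
    return weight * penalty

# usage inside a training step (st_loss and model are assumed to exist):
#   loss = st_loss + l2_sp_penalty(model, pretrained_state)
</code></pre> <p class="is-size-7">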
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ACL 2023 Main Conference</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2304.14467">arXiv:2304.14467</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2304.14467">pdf</a>, <a href="https://arxiv.org/format/2304.14467">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Distributed Quantized Detection of Sparse Signals Under Byzantine Attacks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Quan%2C+C">Chen Quan</a>, <a href="/search/eess?searchtype=author&amp;query=Han%2C+Y+S">Yunghsiang S. Han</a>, <a href="/search/eess?searchtype=author&amp;query=Geng%2C+B">Baocheng Geng</a>, <a href="/search/eess?searchtype=author&amp;query=Varshney%2C+P+K">Pramod K. Varshney</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2304.14467v1-abstract-short" style="display: inline;"> This paper investigates distributed detection of sparse stochastic signals with quantized measurements under Byzantine attacks. Under this type of attack, sensors in the networks might send falsified data to degrade system performance. The Bernoulli-Gaussian (BG) distribution in terms of the sparsity degree of the stochastic signal is utilized for modeling the sparsity of signals. Several detector&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2304.14467v1-abstract-full').style.display = 'inline'; document.getElementById('2304.14467v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2304.14467v1-abstract-full" style="display: none;"> This paper investigates distributed detection of sparse stochastic signals with quantized measurements under Byzantine attacks. Under this type of attack, sensors in the networks might send falsified data to degrade system performance. The Bernoulli-Gaussian (BG) distribution in terms of the sparsity degree of the stochastic signal is utilized for modeling the sparsity of signals. Several detectors with improved detection performance are proposed by incorporating the estimated attack parameters into the detection process. First, we propose the generalized likelihood ratio test with reference sensors (GLRTRS) and the locally most powerful test with reference sensors (LMPTRS) detectors with adaptive thresholds, given that the sparsity degree and the attack parameters are unknown. Our simulation results show that the LMPTRS and GLRTRS detectors outperform the LMPT and GLRT detectors proposed for an attack-free environment and are more robust against attacks. The proposed detectors can achieve the detection performance close to the benchmark likelihood ratio test (LRT) detector, which has perfect knowledge of the attack parameters and sparsity degree. When the fraction of Byzantine nodes are assumed to be known, we can further improve the system&#39;s detection performance. 
We propose the enhanced LMPTRS (E-LMPTRS) and enhanced GLRTRS (E-GLRTRS) detectors, which filter out potentially malicious sensors using knowledge of the fraction of Byzantine nodes in the network. Simulation results show the superiority of the proposed enhanced detectors over the LMPTRS and GLRTRS detectors. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2304.14467v1-abstract-full').style.display = 'none'; document.getElementById('2304.14467v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 April, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2301.10815">arXiv:2301.10815</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2301.10815">pdf</a>, <a href="https://arxiv.org/format/2301.10815">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Human-machine Hierarchical Networks for Decision Making under Byzantine Attacks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Quan%2C+C">Chen Quan</a>, <a href="/search/eess?searchtype=author&amp;query=Geng%2C+B">Baocheng Geng</a>, <a href="/search/eess?searchtype=author&amp;query=Han%2C+Y+S">Yunghsiang S. Han</a>, <a href="/search/eess?searchtype=author&amp;query=Varshney%2C+P+K">Pramod K. Varshney</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2301.10815v1-abstract-short" style="display: inline;"> This paper proposes a belief-updating scheme in a human-machine collaborative decision-making network to combat Byzantine attacks. A hierarchical framework is used to realize the network where local decisions from physical sensors act as reference decisions to improve the quality of human sensor decisions. During the decision-making process, the belief that each physical sensor is malicious is upd&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2301.10815v1-abstract-full').style.display = 'inline'; document.getElementById('2301.10815v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2301.10815v1-abstract-full" style="display: none;"> This paper proposes a belief-updating scheme in a human-machine collaborative decision-making network to combat Byzantine attacks. A hierarchical framework is used to realize the network where local decisions from physical sensors act as reference decisions to improve the quality of human sensor decisions. During the decision-making process, the belief that each physical sensor is malicious is updated. The case when humans have side information available is investigated, and its impact is analyzed. Simulation results substantiate that the proposed scheme can significantly improve the quality of human sensor decisions, even when most physical sensors are malicious. Moreover, the performance of the proposed method does not necessarily depend on the knowledge of the actual fraction of malicious physical sensors.
Consequently, the proposed scheme can effectively defend against Byzantine attacks and improve the quality of human sensors&#39; decisions so that the performance of the human-machine collaborative system is enhanced. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2301.10815v1-abstract-full').style.display = 'none'; document.getElementById('2301.10815v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 January, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2301.09058">arXiv:2301.09058</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2301.09058">pdf</a>, <a href="https://arxiv.org/format/2301.09058">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Leveraging Speaker Embeddings with Adversarial Multi-task Learning for Age Group Classification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Baeg%2C+K">Kwangje Baeg</a>, <a href="/search/eess?searchtype=author&amp;query=Kim%2C+Y">Yeong-Gwan Kim</a>, <a href="/search/eess?searchtype=author&amp;query=Han%2C+Y">Young-Sub Han</a>, <a href="/search/eess?searchtype=author&amp;query=Jeon%2C+B">Byoung-Ki Jeon</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2301.09058v1-abstract-short" style="display: inline;"> Recently, researchers have utilized neural network-based speaker embedding techniques in speaker-recognition tasks to identify speakers accurately. However, speaker-discriminative embeddings do not always represent speech features such as age group well. In an embedding model that has been highly trained to capture speaker traits, the task of age group classification is closer to speech informatio&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2301.09058v1-abstract-full').style.display = 'inline'; document.getElementById('2301.09058v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2301.09058v1-abstract-full" style="display: none;"> Recently, researchers have utilized neural network-based speaker embedding techniques in speaker-recognition tasks to identify speakers accurately. However, speaker-discriminative embeddings do not always represent speech features such as age group well. In an embedding model that has been highly trained to capture speaker traits, the task of age group classification is closer to speech information leakage. Hence, to improve age group classification performance, we consider the use of speaker-discriminative embeddings derived from adversarial multi-task learning to align features and reduce the domain discrepancy in age subgroups. In addition, we investigated different types of speaker embeddings to learn and generalize the domain-invariant representations for age groups. 
Experimental results on the VoxCeleb Enrichment dataset verify the effectiveness of our proposed adaptive adversarial network in multi-objective scenarios and leveraging speaker embeddings for the domain adaptation task. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2301.09058v1-abstract-full').style.display = 'none'; document.getElementById('2301.09058v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 January, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2023. </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Han%2C+Y&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Han%2C+Y&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Han%2C+Y&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Han%2C+Y&amp;start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> </ul> </nav> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a 
href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>
