Search | arXiv e-print repository
<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 168 results for author: <span class="mathjax">Luo, Y</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> <div class="content"> <form method="GET" action="/search/eess" aria-role="search"> Searching in archive <strong>eess</strong>. <a href="/search/?searchtype=author&query=Luo%2C+Y">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Luo, Y"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Luo%2C+Y&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Luo, Y"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Luo%2C+Y&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Luo%2C+Y&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Luo%2C+Y&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Luo%2C+Y&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Luo%2C+Y&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12906">arXiv:2411.12906</a> <span> [<a href="https://arxiv.org/pdf/2411.12906">pdf</a>, <a href="https://arxiv.org/format/2411.12906">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Experimental Study of Underwater Acoustic Reconfigurable Intelligent Surfaces with In-Phase and Quadrature Modulation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Luo%2C+Y">Yu Luo</a>, <a href="/search/eess?searchtype=author&query=Pu%2C+L">Lina Pu</a>, <a href="/search/eess?searchtype=author&query=Song%2C+A">Aijun Song</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12906v1-abstract-short" style="display: inline;"> This paper presents an underwater acoustic reconfigurable intelligent surfaces (UA-RIS) designed for long-range, high-speed, and environmentally friendly communication in oceanic environments. The proposed UA-RIS comprises multiple pairs of acoustic reflectors that utilize in-phase and quadrature (IQ) modulation to flexibly control the amplitude and phase of reflected waves. 
   Abstract: This paper presents an underwater acoustic reconfigurable intelligent surface (UA-RIS) designed for long-range, high-speed, and environmentally friendly communication in oceanic environments. The proposed UA-RIS comprises multiple pairs of acoustic reflectors that use in-phase and quadrature (IQ) modulation to flexibly control the amplitude and phase of reflected waves. This capability enables precise beam steering to enhance or attenuate sound levels in specific directions. A prototype UA-RIS with 4×6 acoustic reflection units is constructed and tested in both tank and lake environments to evaluate performance. The experimental results indicate that the prototype effectively points reflected waves toward targeted directions while minimizing side lobes using passive IQ modulation. Field tests reveal that deploying the UA-RIS on the sender side considerably extends communication range, by 28% in deep water and 46% in shallow water. Furthermore, with a fixed communication distance, positioning the UA-RIS at the transmitter side substantially boosts data rates, with an average increase of 63.8% and peaks up to 96%. When positioned on the receiver side, the UA-RIS can expand the communication range in shallow and deep water by 40.6% and 66%, respectively. Moreover, placing the UA-RIS close to the receiver enhances data rates by an average of 80.3%, reaching up to 163% under certain circumstances.
   Submitted 19 November, 2024; originally announced November 2024.
   Comments: 12 pages, 17 figures
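
A note on the IQ-modulation principle mentioned in this entry (illustrative only, not code from the paper): the amplitude A and phase φ of a reflected tone can be set by weighting an in-phase and a quadrature component, since I·cos(ωt) + Q·sin(ωt) = A·cos(ωt − φ) with I = A·cos φ and Q = A·sin φ. The carrier frequency and target values below are arbitrary assumptions.

```python
import numpy as np

# Illustrative only: map a desired reflection amplitude/phase to I/Q weights
# and verify that the weighted sum reproduces A*cos(w*t - phi).
A_target, phi_target = 0.8, np.deg2rad(60)   # assumed target reflection coefficient
f = 20e3                                     # assumed acoustic carrier frequency (Hz)
t = np.linspace(0, 5 / f, 1000)
w = 2 * np.pi * f

I = A_target * np.cos(phi_target)            # in-phase weight
Q = A_target * np.sin(phi_target)            # quadrature weight
reflected = I * np.cos(w * t) + Q * np.sin(w * t)
reference = A_target * np.cos(w * t - phi_target)

print(np.allclose(reflected, reference))     # True: the IQ weights realize (A, phi)
```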
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">12 pages, 17 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.06526">arXiv:2411.06526</a> <span> [<a href="https://arxiv.org/pdf/2411.06526">pdf</a>, <a href="https://arxiv.org/ps/2411.06526">ps</a>, <a href="https://arxiv.org/format/2411.06526">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> AE-DENet: Enhancement for Deep Learning-based Channel Estimation in OFDM Systems </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Fola%2C+E">Ephrem Fola</a>, <a href="/search/eess?searchtype=author&query=Luo%2C+Y">Yang Luo</a>, <a href="/search/eess?searchtype=author&query=Luo%2C+C">Chunbo Luo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.06526v2-abstract-short" style="display: inline;"> Deep learning (DL)-based methods have demonstrated remarkable achievements in addressing orthogonal frequency division multiplexing (OFDM) channel estimation challenges. However, existing DL-based methods mainly rely on separate real and imaginary inputs while ignoring the inherent correlation between the two streams, such as amplitude and phase information that are fundamental in communication si… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06526v2-abstract-full').style.display = 'inline'; document.getElementById('2411.06526v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.06526v2-abstract-full" style="display: none;"> Deep learning (DL)-based methods have demonstrated remarkable achievements in addressing orthogonal frequency division multiplexing (OFDM) channel estimation challenges. However, existing DL-based methods mainly rely on separate real and imaginary inputs while ignoring the inherent correlation between the two streams, such as amplitude and phase information that are fundamental in communication signal processing. This paper proposes AE-DENet, a novel autoencoder(AE)-based data enhancement network to improve the performance of existing DL-based channel estimation methods. AE-DENet focuses on enriching the classic least square (LS) estimation input commonly used in DL-based methods by employing a learning-based data enhancement method, which extracts interaction features from the real and imaginary components and fuses them with the original real/imaginary streams to generate an enhanced input for better channel inference. Experimental findings in terms of the mean square error (MSE) results demonstrate that the proposed method enhances the performance of all state-of-the-art DL-based channel estimators with negligible added complexity. Furthermore, the proposed approach is shown to be robust to channel variations and high user mobility. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06526v2-abstract-full').style.display = 'none'; document.getElementById('2411.06526v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 10 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This paper is accepted for IEEE GLOBECOM 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.13720">arXiv:2410.13720</a> <span> [<a href="https://arxiv.org/pdf/2410.13720">pdf</a>, <a href="https://arxiv.org/format/2410.13720">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Movie Gen: A Cast of Media Foundation Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Polyak%2C+A">Adam Polyak</a>, <a href="/search/eess?searchtype=author&query=Zohar%2C+A">Amit Zohar</a>, <a href="/search/eess?searchtype=author&query=Brown%2C+A">Andrew Brown</a>, <a href="/search/eess?searchtype=author&query=Tjandra%2C+A">Andros Tjandra</a>, <a href="/search/eess?searchtype=author&query=Sinha%2C+A">Animesh Sinha</a>, <a href="/search/eess?searchtype=author&query=Lee%2C+A">Ann Lee</a>, <a href="/search/eess?searchtype=author&query=Vyas%2C+A">Apoorv Vyas</a>, <a href="/search/eess?searchtype=author&query=Shi%2C+B">Bowen Shi</a>, <a href="/search/eess?searchtype=author&query=Ma%2C+C">Chih-Yao Ma</a>, <a href="/search/eess?searchtype=author&query=Chuang%2C+C">Ching-Yao Chuang</a>, <a href="/search/eess?searchtype=author&query=Yan%2C+D">David Yan</a>, <a href="/search/eess?searchtype=author&query=Choudhary%2C+D">Dhruv Choudhary</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+D">Dingkang Wang</a>, <a href="/search/eess?searchtype=author&query=Sethi%2C+G">Geet Sethi</a>, <a href="/search/eess?searchtype=author&query=Pang%2C+G">Guan Pang</a>, <a href="/search/eess?searchtype=author&query=Ma%2C+H">Haoyu Ma</a>, <a href="/search/eess?searchtype=author&query=Misra%2C+I">Ishan Misra</a>, <a href="/search/eess?searchtype=author&query=Hou%2C+J">Ji Hou</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+J">Jialiang Wang</a>, <a href="/search/eess?searchtype=author&query=Jagadeesh%2C+K">Kiran Jagadeesh</a>, <a href="/search/eess?searchtype=author&query=Li%2C+K">Kunpeng Li</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+L">Luxin Zhang</a>, <a href="/search/eess?searchtype=author&query=Singh%2C+M">Mannat Singh</a>, <a href="/search/eess?searchtype=author&query=Williamson%2C+M">Mary Williamson</a>, <a 
href="/search/eess?searchtype=author&query=Le%2C+M">Matt Le</a> , et al. (63 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.13720v1-abstract-short" style="display: inline;"> We present Movie Gen, a cast of foundation models that generates high-quality, 1080p HD videos with different aspect ratios and synchronized audio. We also show additional capabilities such as precise instruction-based video editing and generation of personalized videos based on a user's image. Our models set a new state-of-the-art on multiple tasks: text-to-video synthesis, video personalization,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13720v1-abstract-full').style.display = 'inline'; document.getElementById('2410.13720v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.13720v1-abstract-full" style="display: none;"> We present Movie Gen, a cast of foundation models that generates high-quality, 1080p HD videos with different aspect ratios and synchronized audio. We also show additional capabilities such as precise instruction-based video editing and generation of personalized videos based on a user's image. Our models set a new state-of-the-art on multiple tasks: text-to-video synthesis, video personalization, video editing, video-to-audio generation, and text-to-audio generation. Our largest video generation model is a 30B parameter transformer trained with a maximum context length of 73K video tokens, corresponding to a generated video of 16 seconds at 16 frames-per-second. We show multiple technical innovations and simplifications on the architecture, latent spaces, training objectives and recipes, data curation, evaluation protocols, parallelization techniques, and inference optimizations that allow us to reap the benefits of scaling pre-training data, model size, and training compute for training large scale media generation models. We hope this paper helps the research community to accelerate progress and innovation in media generation models. All videos from this paper are available at https://go.fb.me/MovieGenResearchVideos. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13720v1-abstract-full').style.display = 'none'; document.getElementById('2410.13720v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.08588">arXiv:2410.08588</a> <span> [<a href="https://arxiv.org/pdf/2410.08588">pdf</a>, <a href="https://arxiv.org/format/2410.08588">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> ViT3D Alignment of LLaMA3: 3D Medical Image Report Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Li%2C+S">Siyou Li</a>, <a href="/search/eess?searchtype=author&query=Xu%2C+B">Beining Xu</a>, <a href="/search/eess?searchtype=author&query=Luo%2C+Y">Yihao Luo</a>, <a href="/search/eess?searchtype=author&query=Nie%2C+D">Dong Nie</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+L">Le Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.08588v1-abstract-short" style="display: inline;"> Automatic medical report generation (MRG), which aims to produce detailed text reports from medical images, has emerged as a critical task in this domain. MRG systems can enhance radiological workflows by reducing the time and effort required for report writing, thereby improving diagnostic efficiency. In this work, we present a novel approach for automatic MRG utilizing a multimodal large languag… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.08588v1-abstract-full').style.display = 'inline'; document.getElementById('2410.08588v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.08588v1-abstract-full" style="display: none;"> Automatic medical report generation (MRG), which aims to produce detailed text reports from medical images, has emerged as a critical task in this domain. MRG systems can enhance radiological workflows by reducing the time and effort required for report writing, thereby improving diagnostic efficiency. In this work, we present a novel approach for automatic MRG utilizing a multimodal large language model. Specifically, we employed the 3D Vision Transformer (ViT3D) image encoder introduced from M3D-CLIP to process 3D scans and use the Asclepius-Llama3-8B as the language model to generate the text reports by auto-regressive decoding. The experiment shows our model achieved an average Green score of 0.3 on the MRG task validation set and an average accuracy of 0.61 on the visual question answering (VQA) task validation set, outperforming the baseline model. Our approach demonstrates the effectiveness of the ViT3D alignment of LLaMA3 for automatic MRG and VQA tasks by tuning the model on a small dataset. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.08588v1-abstract-full').style.display = 'none'; document.getElementById('2410.08588v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.00620">arXiv:2410.00620</a> <span> [<a href="https://arxiv.org/pdf/2410.00620">pdf</a>, <a href="https://arxiv.org/ps/2410.00620">ps</a>, <a href="https://arxiv.org/format/2410.00620">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Differentiable Interacting Multiple Model Particle Filtering </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Brady%2C+J">John-Joseph Brady</a>, <a href="/search/eess?searchtype=author&query=Luo%2C+Y">Yuhui Luo</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+W">Wenwu Wang</a>, <a href="/search/eess?searchtype=author&query=Elvira%2C+V">V铆ctor Elvira</a>, <a href="/search/eess?searchtype=author&query=Li%2C+Y">Yunpeng Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.00620v1-abstract-short" style="display: inline;"> We propose a sequential Monte Carlo algorithm for parameter learning when the studied model exhibits random discontinuous jumps in behaviour. To facilitate the learning of high dimensional parameter sets, such as those associated to neural networks, we adopt the emerging framework of differentiable particle filtering, wherein parameters are trained by gradient descent. We design a new differentiab… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.00620v1-abstract-full').style.display = 'inline'; document.getElementById('2410.00620v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.00620v1-abstract-full" style="display: none;"> We propose a sequential Monte Carlo algorithm for parameter learning when the studied model exhibits random discontinuous jumps in behaviour. To facilitate the learning of high dimensional parameter sets, such as those associated to neural networks, we adopt the emerging framework of differentiable particle filtering, wherein parameters are trained by gradient descent. We design a new differentiable interacting multiple model particle filter to be capable of learning the individual behavioural regimes and the model which controls the jumping simultaneously. In contrast to previous approaches, our algorithm allows control of the computational effort assigned per regime whilst using the probability of being in a given regime to guide sampling. 
   Furthermore, we develop a new gradient estimator that has lower variance than established approaches and remains fast to compute, and we prove its consistency. We establish new theoretical results for the presented algorithms and demonstrate superior numerical performance compared to previous state-of-the-art algorithms.
   Submitted 1 October, 2024; originally announced October 2024.
   MSC Class: 62M20; 62F12

6. arXiv:2409.08514 [cs.SD, cs.AI, eess.AS]
   Apollo: Band-sequence Modeling for High-Quality Audio Restoration
   Authors: Kai Li, Yi Luo
   Abstract: Audio restoration has become increasingly significant in modern society, not only due to the demand for high-quality auditory experiences enabled by advanced playback devices, but also because the growing capabilities of generative audio models necessitate high-fidelity audio. Typically, audio restoration is defined as a task of predicting undistorted audio from damaged input, often trained using a GAN framework to balance perception and distortion.
   Since audio degradation is primarily concentrated in mid- and high-frequency ranges, especially due to codecs, a key challenge lies in designing a generator capable of preserving low-frequency information while accurately reconstructing high-quality mid- and high-frequency content. Inspired by recent advancements in high-sample-rate music separation, speech enhancement, and audio codec models, we propose Apollo, a generative model designed for high-sample-rate audio restoration. Apollo employs an explicit frequency band split module to model the relationships between different frequency bands, allowing for more coherent and higher-quality restored audio. Evaluated on the MUSDB18-HQ and MoisesDB datasets, Apollo consistently outperforms existing SR-GAN models across various bit rates and music genres, particularly excelling in complex scenarios involving mixtures of multiple instruments and vocals. Apollo significantly improves music restoration quality while maintaining computational efficiency. The source code for Apollo is publicly available at https://github.com/JusperLee/Apollo.
   Submitted 12 September, 2024; originally announced September 2024.
   Comments: Demo Page: https://cslikai.cn/Apollo

7. arXiv:2409.06948 [cs.RO, eess.SY]
   Equivariant Filter for Tightly Coupled LiDAR-Inertial Odometry
   Authors: Anbo Tao, Yarong Luo, Chunxi Xia, Chi Guo, Xingxing Li
   Abstract: Pose estimation is a crucial problem in simultaneous localization and mapping (SLAM). However, developing a robust and consistent state estimator remains a significant challenge, as the traditional extended Kalman filter (EKF) struggles to handle model nonlinearity, especially for the inertial measurement unit (IMU) and light detection and ranging (LiDAR). To provide a consistent and efficient solution for pose estimation, we propose Eq-LIO, a robust state estimator for tightly coupled LIO systems based on an equivariant filter (EqF). Compared with the invariant Kalman filter based on the $\SE_2(3)$ group structure, the EqF uses the symmetry of the semi-direct product group to couple the system state, including IMU bias, navigation state, and LiDAR extrinsic calibration state, thereby suppressing linearization error and improving the behavior of the estimator in the event of unexpected state changes. The proposed Eq-LIO has natural consistency and higher robustness, which is theoretically proven with mathematical derivation and experimentally verified through a series of tests on both public and private datasets.
   Submitted 10 September, 2024; originally announced September 2024.
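
For reference on the $\SE_2(3)$ ("extended pose") group mentioned above: it is commonly represented by 5×5 matrices stacking a rotation R, velocity v, and position p, composed by matrix multiplication. The sketch below only illustrates that standard embedding; it is not code from the paper and does not implement the equivariant filter itself.

```python
import numpy as np

def se23(R, v, p):
    """Embed an extended pose (R, v, p) as a 5x5 SE_2(3) matrix."""
    T = np.eye(5)
    T[:3, :3] = R
    T[:3, 3] = v
    T[:3, 4] = p
    return T

# Group composition is plain matrix multiplication:
# (R1, v1, p1) * (R2, v2, p2) = (R1 @ R2, R1 @ v2 + v1, R1 @ p2 + p1)
Rz = lambda a: np.array([[np.cos(a), -np.sin(a), 0.0],
                         [np.sin(a),  np.cos(a), 0.0],
                         [0.0,        0.0,       1.0]])
T1 = se23(Rz(0.3), np.array([1.0, 0.0, 0.0]), np.array([0.0, 2.0, 0.0]))
T2 = se23(Rz(-0.1), np.array([0.0, 0.5, 0.0]), np.array([0.2, 0.0, 1.0]))
T12 = T1 @ T2
print(np.allclose(T12[:3, 3], Rz(0.3) @ np.array([0.0, 0.5, 0.0]) + np.array([1.0, 0.0, 0.0])))  # True
```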
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.02070">arXiv:2409.02070</a> <span> [<a href="https://arxiv.org/pdf/2409.02070">pdf</a>, <a href="https://arxiv.org/format/2409.02070">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Explicit Differentiable Slicing and Global Deformation for Cardiac Mesh Reconstruction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Luo%2C+Y">Yihao Luo</a>, <a href="/search/eess?searchtype=author&query=Sesia%2C+D">Dario Sesia</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+F">Fanwen Wang</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+Y">Yinzhe Wu</a>, <a href="/search/eess?searchtype=author&query=Ding%2C+W">Wenhao Ding</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+J">Jiahao Huang</a>, <a href="/search/eess?searchtype=author&query=Shi%2C+F">Fadong Shi</a>, <a href="/search/eess?searchtype=author&query=Shah%2C+A">Anoop Shah</a>, <a href="/search/eess?searchtype=author&query=Kaural%2C+A">Amit Kaural</a>, <a href="/search/eess?searchtype=author&query=Mayet%2C+J">Jamil Mayet</a>, <a href="/search/eess?searchtype=author&query=Yang%2C+G">Guang Yang</a>, <a href="/search/eess?searchtype=author&query=Yap%2C+C">ChoonHwai Yap</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.02070v2-abstract-short" style="display: inline;"> Mesh reconstruction of the cardiac anatomy from medical images is useful for shape and motion measurements and biophysics simulations to facilitate the assessment of cardiac function and health. However, 3D medical images are often acquired as 2D slices that are sparsely sampled and noisy, and mesh reconstruction on such data is a challenging task. Traditional voxel-based approaches rely on pre- a… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.02070v2-abstract-full').style.display = 'inline'; document.getElementById('2409.02070v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.02070v2-abstract-full" style="display: none;"> Mesh reconstruction of the cardiac anatomy from medical images is useful for shape and motion measurements and biophysics simulations to facilitate the assessment of cardiac function and health. However, 3D medical images are often acquired as 2D slices that are sparsely sampled and noisy, and mesh reconstruction on such data is a challenging task. Traditional voxel-based approaches rely on pre- and post-processing that compromises image fidelity, while mesh-level deep learning approaches require mesh annotations that are difficult to get. Therefore, direct cross-domain supervision from 2D images to meshes is a key technique for advancing 3D learning in medical imaging, but it has not been well-developed. While there have been attempts to approximate the optimized meshes' slicing, few existing methods directly use 2D slices to supervise mesh reconstruction in a differentiable manner. 
   Here, we propose a novel explicit differentiable voxelization and slicing (DVS) algorithm that allows gradient backpropagation to a mesh from its slices, facilitating refined mesh optimization directly supervised by losses defined on 2D images. Further, we propose an innovative framework for extracting patient-specific left ventricle (LV) meshes from medical images by coupling DVS with a graph harmonic deformation (GHD) mesh morphing descriptor of cardiac shape that naturally preserves mesh quality and smoothness during optimization. Experimental results demonstrate that our method achieves state-of-the-art performance in cardiac mesh reconstruction from CT and MRI, with an overall Dice score of 90% across multiple datasets, outperforming existing approaches. The proposed method can further quantify clinically useful parameters such as ejection fraction and global myocardial strains, closely matching the ground truth and surpassing the traditional voxel-based approach on sparse images.
   Submitted 20 October, 2024; v1 submitted 3 September, 2024; originally announced September 2024.

9. arXiv:2408.10807 [cs.SD, cs.AI, cs.LG, eess.AS]
   DisMix: Disentangling Mixtures of Musical Instruments for Source-level Pitch and Timbre Manipulation
   Authors: Yin-Jyun Luo, Kin Wai Cheuk, Woosung Choi, Toshimitsu Uesaka, Keisuke Toyama, Koichi Saito, Chieh-Hsin Lai, Yuhta Takida, Wei-Hsiang Liao, Simon Dixon, Yuki Mitsufuji
id="2408.10807v1-abstract-short" style="display: inline;"> Existing work on pitch and timbre disentanglement has been mostly focused on single-instrument music audio, excluding the cases where multiple instruments are presented. To fill the gap, we propose DisMix, a generative framework in which the pitch and timbre representations act as modular building blocks for constructing the melody and instrument of a source, and the collection of which forms a se… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.10807v1-abstract-full').style.display = 'inline'; document.getElementById('2408.10807v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.10807v1-abstract-full" style="display: none;"> Existing work on pitch and timbre disentanglement has been mostly focused on single-instrument music audio, excluding the cases where multiple instruments are presented. To fill the gap, we propose DisMix, a generative framework in which the pitch and timbre representations act as modular building blocks for constructing the melody and instrument of a source, and the collection of which forms a set of per-instrument latent representations underlying the observed mixture. By manipulating the representations, our model samples mixtures with novel combinations of pitch and timbre of the constituent instruments. We can jointly learn the disentangled pitch-timbre representations and a latent diffusion transformer that reconstructs the mixture conditioned on the set of source-level representations. We evaluate the model using both a simple dataset of isolated chords and a realistic four-part chorales in the style of J.S. Bach, identify the key components for the success of disentanglement, and demonstrate the application of mixture transformation based on source-level attribute manipulation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.10807v1-abstract-full').style.display = 'none'; document.getElementById('2408.10807v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.16634">arXiv:2407.16634</a> <span> [<a href="https://arxiv.org/pdf/2407.16634">pdf</a>, <a href="https://arxiv.org/format/2407.16634">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> Knowledge-driven AI-generated data for accurate and interpretable breast ultrasound diagnoses </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Yu%2C+H">Haojun Yu</a>, <a href="/search/eess?searchtype=author&query=Li%2C+Y">Youcheng Li</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+N">Nan Zhang</a>, <a href="/search/eess?searchtype=author&query=Niu%2C+Z">Zihan Niu</a>, <a href="/search/eess?searchtype=author&query=Gong%2C+X">Xuantong Gong</a>, <a href="/search/eess?searchtype=author&query=Luo%2C+Y">Yanwen Luo</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+Q">Quanlin Wu</a>, <a href="/search/eess?searchtype=author&query=Qin%2C+W">Wangyan Qin</a>, <a href="/search/eess?searchtype=author&query=Zhou%2C+M">Mengyuan Zhou</a>, <a href="/search/eess?searchtype=author&query=Han%2C+J">Jie Han</a>, <a href="/search/eess?searchtype=author&query=Tao%2C+J">Jia Tao</a>, <a href="/search/eess?searchtype=author&query=Zhao%2C+Z">Ziwei Zhao</a>, <a href="/search/eess?searchtype=author&query=Dai%2C+D">Di Dai</a>, <a href="/search/eess?searchtype=author&query=He%2C+D">Di He</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+D">Dong Wang</a>, <a href="/search/eess?searchtype=author&query=Tang%2C+B">Binghui Tang</a>, <a href="/search/eess?searchtype=author&query=Huo%2C+L">Ling Huo</a>, <a href="/search/eess?searchtype=author&query=Zhu%2C+Q">Qingli Zhu</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+Y">Yong Wang</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+L">Liwei Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.16634v1-abstract-short" style="display: inline;"> Data-driven deep learning models have shown great capabilities to assist radiologists in breast ultrasound (US) diagnoses. However, their effectiveness is limited by the long-tail distribution of training data, which leads to inaccuracies in rare cases. In this study, we address a long-standing challenge of improving the diagnostic model performance on rare cases using long-tailed data. Specifical… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.16634v1-abstract-full').style.display = 'inline'; document.getElementById('2407.16634v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.16634v1-abstract-full" style="display: none;"> Data-driven deep learning models have shown great capabilities to assist radiologists in breast ultrasound (US) diagnoses. 
    However, their effectiveness is limited by the long-tail distribution of training data, which leads to inaccuracies in rare cases. In this study, we address the long-standing challenge of improving diagnostic model performance on rare cases using long-tailed data. Specifically, we introduce a pipeline, TAILOR, that builds a knowledge-driven generative model to produce tailored synthetic data. The generative model, using 3,749 lesions as source data, can generate millions of breast-US images, especially for error-prone rare cases. The generated data can be further used to build a diagnostic model for accurate and interpretable diagnoses. In the prospective external evaluation, our diagnostic model outperforms the average performance of nine radiologists by 33.5% in specificity at the same sensitivity, and improves their performance by providing predictions with an interpretable decision-making process. Moreover, on ductal carcinoma in situ (DCIS), our diagnostic model outperforms all radiologists by a large margin, with only 34 DCIS lesions in the source data. We believe that TAILOR can potentially be extended to various diseases and imaging modalities.
    Submitted 23 July, 2024; originally announced July 2024.

11. arXiv:2407.14754 [eess.IV, cs.CV]
    Representing Topological Self-Similarity Using Fractal Feature Maps for Accurate Segmentation of Tubular Structures
    Authors: Jiaxing Huang, Yanfeng Zhou, Yaoru Luo, Guole Liu, Heng Guo, Ge Yang
    Abstract: Accurate segmentation of long and thin tubular structures is required in a wide variety of areas such as biology, medicine, and remote sensing. The complex topology and geometry of such structures often pose significant technical challenges. A fundamental property of such structures is their topological self-similarity, which can be quantified by fractal features such as the fractal dimension (FD). In this study, we incorporate fractal features into a deep learning model by extending FD to the pixel level using a sliding-window technique. The resulting fractal feature maps (FFMs) are then incorporated as additional input to the model and as additional weight in the loss function to enhance segmentation performance by exploiting topological self-similarity. Moreover, we extend the U-Net architecture by incorporating an edge decoder and a skeleton decoder to improve the boundary accuracy and skeletal continuity of the segmentation, respectively. Extensive experiments on five tubular-structure datasets validate the effectiveness and robustness of our approach. Furthermore, integrating FFMs with other popular segmentation models such as HR-Net also yields performance gains, suggesting that FFMs can be incorporated as a plug-in module with different model architectures. Code and data are openly accessible at https://github.com/cbmi-group/FFM-Multi-Decoder-Network.
    Submitted 20 July, 2024; originally announced July 2024.
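
A minimal sketch of the pixel-level fractal-dimension idea described in this abstract, using plain box counting inside a sliding window; the window size, box sizes, and use of a binary mask are illustrative assumptions rather than the paper's exact procedure.

```python
import numpy as np

def box_count_fd(patch):
    """Estimate the box-counting fractal dimension of a binary 2D patch."""
    sizes = [1, 2, 4, 8]                             # assumed box sizes (patch is 16x16 here)
    counts = []
    for s in sizes:
        h, w = patch.shape
        boxes = patch[:h - h % s, :w - w % s].reshape(h // s, s, w // s, s)
        counts.append(boxes.any(axis=(1, 3)).sum())  # boxes containing any foreground pixel
    # FD is the slope of log(count) versus log(1/size)
    slope, _ = np.polyfit(np.log(1.0 / np.array(sizes)), np.log(np.array(counts)), 1)
    return slope

def fractal_feature_map(mask, win=16):
    """Slide a window over a binary mask and assign each window its local FD."""
    H, W = mask.shape
    fd = np.zeros((H // win, W // win))
    for i in range(H // win):
        for j in range(W // win):
            patch = mask[i * win:(i + 1) * win, j * win:(j + 1) * win]
            fd[i, j] = box_count_fd(patch) if patch.any() else 0.0
    return fd

mask = np.random.rand(64, 64) > 0.7                  # fake binary segmentation mask
print(fractal_feature_map(mask).shape)               # (4, 4) coarse fractal feature map
```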

arXiv:2407.08813 (https://arxiv.org/abs/2407.08813) [pdf, other]
Subjects: Image and Video Processing (eess.IV); Artificial Intelligence (cs.AI); Computer Vision and Pattern Recognition (cs.CV)
Title: FairDomain: Achieving Fairness in Cross-Domain Medical Image Segmentation and Classification
Authors: Yu Tian, Congcong Wen, Min Shi, Muhammad Muneeb Afzal, Hao Huang, Muhammad Osama Khan, Yan Luo, Yi Fang, Mengyu Wang
Abstract: Addressing fairness in artificial intelligence (AI), particularly in medical AI, is crucial for ensuring equitable healthcare outcomes. Recent efforts to enhance fairness have introduced new methodologies and datasets in medical AI. However, the fairness issue under the setting of domain transfer is almost unexplored, even though it is common for clinics to rely on different imaging technologies (e.g., different retinal imaging modalities) for patient diagnosis. This paper presents FairDomain, a pioneering systematic study of algorithmic fairness under domain shifts, employing state-of-the-art domain adaptation (DA) and generalization (DG) algorithms for both medical segmentation and classification tasks to understand how biases are transferred between different domains. We also introduce a novel plug-and-play fair identity attention (FIA) module that adapts to various DA and DG algorithms to improve fairness by using self-attention to adjust feature importance based on demographic attributes. Additionally, we curate the first fairness-focused dataset with two paired imaging modalities for the same patient cohort on medical segmentation and classification tasks, to rigorously assess fairness in domain-shift scenarios. Excluding the confounding impact of demographic distribution variation between source and target domains allows clearer quantification of the performance of domain transfer models. Our extensive evaluations reveal that the proposed FIA significantly enhances model performance, accounting for fairness, across all domain shift settings (i.e., DA and DG) and demographic groups, outperforming existing methods on both segmentation and classification. The code and data can be accessed at https://ophai.hms.harvard.edu/datasets/harvard-fairdomain20k.
Submitted 18 July, 2024; v1 submitted 11 July, 2024; originally announced July 2024.
Comments: ECCV 2024; Codes and datasets are available at https://github.com/Harvard-Ophthalmology-AI-Lab/FairDomain
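
As a rough illustration of how an attention module can condition feature importance on a demographic attribute, here is a generic sketch in PyTorch. It is my own simplification, not the FIA module from the paper: the module name, the attribute embedding, and the append-a-token design are all assumptions, and the released code at the linked repository is the authoritative reference.

```python
import torch
import torch.nn as nn

class IdentityAttention(nn.Module):
    """Illustrative (not the paper's) module: prepend an embedding of the
    demographic attribute as an extra token and let self-attention reweight
    the feature tokens conditioned on it."""
    def __init__(self, dim, num_groups, heads=4):
        super().__init__()
        self.attr_embed = nn.Embedding(num_groups, dim)
        self.attn = nn.MultiheadAttention(dim, heads, batch_first=True)
        self.norm = nn.LayerNorm(dim)

    def forward(self, tokens, attr_id):
        # tokens: (B, N, dim) feature tokens; attr_id: (B,) integer group labels
        attr_tok = self.attr_embed(attr_id).unsqueeze(1)      # (B, 1, dim)
        x = torch.cat([attr_tok, tokens], dim=1)              # attribute token first
        out, _ = self.attn(x, x, x)
        return self.norm(out[:, 1:] + tokens)                 # drop attribute token, residual

feats = torch.randn(2, 196, 256)      # e.g. ViT-style patch tokens
attrs = torch.tensor([0, 3])          # demographic group indices
print(IdentityAttention(256, num_groups=4)(feats, attrs).shape)  # torch.Size([2, 196, 256])
```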

arXiv:2407.02005 (https://arxiv.org/abs/2407.02005) [pdf, other]
Subjects: Computation and Language (cs.CL); Sound (cs.SD); Audio and Speech Processing (eess.AS)
Title: An End-to-End Speech Summarization Using Large Language Model
Authors: Hengchao Shang, Zongyao Li, Jiaxin Guo, Shaojun Li, Zhiqiang Rao, Yuanchang Luo, Daimeng Wei, Hao Yang
Abstract: Abstractive Speech Summarization (SSum) aims to generate human-like text summaries from spoken content. It encounters difficulties in handling long speech input and in capturing the intricate cross-modal mapping between long speech inputs and short text summaries. Research on large language models (LLMs) and multimodal information fusion has provided new insights for addressing these challenges. In this paper, we propose an end-to-end SSum model that utilizes Q-Former as a connector for the audio-text modality and employs an LLM to generate text summaries directly from speech features. We adopt a multi-stage training approach that includes LLM-based ASR and Text Summarization (TSum) tasks as auxiliary tasks. The ASR task is used to align the feature spaces and enhance the LLM's ability to handle longer speech. We then utilize a curriculum learning strategy to facilitate the model's transition from TSum to SSum. Finally, our model achieves competitive performance on the How-2 dataset.
Submitted 2 July, 2024; originally announced July 2024.
Comments: InterSpeech 2024
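
A Q-Former-style connector compresses a long sequence of speech features into a fixed number of learned query embeddings that an LLM can consume as a short prefix. The snippet below is a minimal sketch of that interface under assumed dimensions (speech feature size, number of queries, LLM hidden size); it is not the model from the paper, which builds this bridge on top of trained encoders and an LLM.

```python
import torch
import torch.nn as nn

class SpeechQFormerConnector(nn.Module):
    """Minimal Q-Former-style bridge: a fixed set of learned queries
    cross-attends to (long) speech features and is projected into the
    LLM embedding space as a short prefix."""
    def __init__(self, speech_dim=1024, llm_dim=4096, num_queries=32, heads=8):
        super().__init__()
        self.queries = nn.Parameter(torch.randn(num_queries, speech_dim) * 0.02)
        self.cross_attn = nn.MultiheadAttention(speech_dim, heads, batch_first=True)
        self.proj = nn.Linear(speech_dim, llm_dim)

    def forward(self, speech_feats):
        # speech_feats: (B, T, speech_dim); T can be thousands of frames
        q = self.queries.unsqueeze(0).expand(speech_feats.size(0), -1, -1)
        fused, _ = self.cross_attn(q, speech_feats, speech_feats)
        return self.proj(fused)            # (B, num_queries, llm_dim) prefix for the LLM

prefix = SpeechQFormerConnector()(torch.randn(2, 1500, 1024))
print(prefix.shape)  # torch.Size([2, 32, 4096])
```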

arXiv:2407.01860 (https://arxiv.org/abs/2407.01860) [pdf, other]
Subjects: Sound (cs.SD); Audio and Speech Processing (eess.AS); Signal Processing (eess.SP)
DOI: 10.23919/EUSIPCO63174.2024.10715159 (https://doi.org/10.23919/EUSIPCO63174.2024.10715159)
Title: Constant Directivity Loudspeaker Beamforming
Authors: Yuancheng Luo
Abstract: Loudspeaker array beamforming is a common signal processing technique for acoustic directivity control and robust audio reproduction. Unlike their microphone counterparts, loudspeaker arrays often face heterogeneous constraints, as the arrayed transducers vary in operating frequency range, acoustic-electrical sensitivity, efficiency, and directivity. This work proposes a frequency-regularization method for generalized Rayleigh quotient directivity specifications and two novel beamformer designs that optimize for maximum efficiency constant directivity (MECD) and maximum sensitivity constant directivity (MSCD). We derive fast-converging and analytic solutions from their quadratic equality constrained quadratic program formulations. Experiments optimize generalized directivity index constrained beamformer designs for a full-band heterogeneous array.
Submitted 23 November, 2024; v1 submitted 1 July, 2024; originally announced July 2024.
Comments: Accepted at EUSIPCO 2024
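
For orientation, a directivity index of this kind is typically written as a generalized Rayleigh quotient in the complex array weights and optimized subject to a quadratic equality constraint. The notation below is generic and only meant to unpack the phrasing above; it does not reproduce the paper's exact MECD/MSCD formulations or its frequency regularization.

```latex
% Generic constant-directivity beamformer sketch (notation assumed):
% w = complex array weights; A(\omega), B(\omega) = Hermitian matrices built from
% the array responses over the look/total regions; C(\omega) = constraint matrix.
\[
  D(\mathbf{w},\omega) \;=\;
  \frac{\mathbf{w}^{H}\mathbf{A}(\omega)\,\mathbf{w}}
       {\mathbf{w}^{H}\mathbf{B}(\omega)\,\mathbf{w}},
  \qquad
  \max_{\mathbf{w}}\; \mathbf{w}^{H}\mathbf{A}(\omega)\,\mathbf{w}
  \quad \text{s.t.} \quad
  \mathbf{w}^{H}\mathbf{C}(\omega)\,\mathbf{w} \;=\; c_{0}.
\]
```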

arXiv:2406.14878 (https://arxiv.org/abs/2406.14878) [pdf, other]
Subjects: Computer Vision and Pattern Recognition (cs.CV); Machine Learning (cs.LG); Image and Video Processing (eess.IV)
Title: MOS: Model Synergy for Test-Time Adaptation on LiDAR-Based 3D Object Detection
Authors: Zhuoxiao Chen, Junjie Meng, Mahsa Baktashmotlagh, Yonggang Zhang, Zi Huang, Yadan Luo
Abstract: LiDAR-based 3D object detection is crucial for various applications but often experiences performance degradation in real-world deployments due to domain shifts. While most studies focus on cross-dataset shifts, such as changes in environments and object geometries, practical corruptions from sensor variations and weather conditions remain underexplored. In this work, we propose a novel online test-time adaptation framework for 3D detectors that effectively tackles these shifts, including a challenging cross-corruption scenario where cross-dataset shifts and corruptions co-occur. By leveraging long-term knowledge from previous test batches, our approach mitigates catastrophic forgetting and adapts effectively to diverse shifts. Specifically, we propose a Model Synergy (MOS) strategy that dynamically selects historical checkpoints with diverse knowledge and assembles them to best accommodate the current test batch. This assembly is directed by our proposed Synergy Weights (SW), which perform a weighted averaging of the selected checkpoints, minimizing redundancy in the composite model. The SWs are computed by evaluating the similarity of predicted bounding boxes on the test data and the independence of features between checkpoint pairs in the model bank. To maintain an efficient and informative model bank, we discard checkpoints with the lowest average SW scores, replacing them with newly updated models. Our method was rigorously tested against existing test-time adaptation strategies across three datasets and eight types of corruptions, demonstrating superior adaptability to dynamic scenes and conditions. Notably, it achieved a 67.3% improvement in a challenging cross-corruption scenario, offering a more comprehensive benchmark for adaptation. The source code will be made publicly available.
Submitted 17 October, 2024; v1 submitted 21 June, 2024; originally announced June 2024.
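
The checkpoint-assembly step described above amounts to a per-parameter weighted average of saved model states. Below is a minimal sketch of that averaging; the synergy-weight computation itself (box similarity and feature independence) is not reproduced here, and the weights are simply assumed to be given.

```python
import copy
import torch

def assemble_checkpoints(state_dicts, synergy_weights):
    """Weighted-average a bank of checkpoints into one composite state dict.
    state_dicts: list of model.state_dict() snapshots with identical keys.
    synergy_weights: list of non-negative weights, one per checkpoint."""
    w = torch.tensor(synergy_weights, dtype=torch.float32)
    w = w / w.sum()                                    # normalize so the weights sum to 1
    composite = copy.deepcopy(state_dicts[0])
    for key, ref in composite.items():
        if torch.is_floating_point(ref):
            composite[key] = sum(wi * sd[key] for wi, sd in zip(w, state_dicts))
        # integer buffers (e.g. BatchNorm's num_batches_tracked) keep the first checkpoint's value
    return composite

# usage sketch: model.load_state_dict(assemble_checkpoints(model_bank, weights))
```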

arXiv:2406.13788 (https://arxiv.org/abs/2406.13788) [pdf, other]
Subjects: Signal Processing (eess.SP)
Title: Groupwise Deformable Registration of Diffusion Tensor Cardiovascular Magnetic Resonance: Disentangling Diffusion Contrast, Respiratory and Cardiac Motions
Authors: Fanwen Wang, Yihao Luo, Ke Wen, Jiahao Huang, Pedro F. Ferreira, Yaqing Luo, Yinzhe Wu, Camila Munoz, Dudley J. Pennell, Andrew D. Scott, Sonia Nielles-Vallespin, Guang Yang
Abstract: Diffusion tensor based cardiovascular magnetic resonance (DT-CMR) offers a non-invasive method to visualize the myocardial microstructure. With the assumption that the heart is stationary, frames are acquired with multiple repetitions for different diffusion encoding directions. However, motion from poor breath-holding and imprecise cardiac triggering complicates DT-CMR analysis, which is further challenged by its inherently low SNR, varied contrasts, and diffusion-induced textures. Our solution is a novel framework employing groupwise registration with an implicit template to isolate respiratory and cardiac motions, while a tensor-embedded branch preserves diffusion contrast textures. We have devised a loss refinement tailored for non-linear least squares fitting and low SNR conditions. Additionally, we introduce new physics-based and clinical metrics for performance evaluation. Access code and supplementary materials at: https://github.com/ayanglab/DTCMR-Reg
Submitted 3 July, 2024; v1 submitted 19 June, 2024; originally announced June 2024.
Comments: Accepted by MICCAI 2024

arXiv:2406.13708 (https://arxiv.org/abs/2406.13708) [pdf]
Subjects: Image and Video Processing (eess.IV); Medical Physics (physics.med-ph)
Title: Low-rank based motion correction followed by automatic frame selection in DT-CMR
Authors: Fanwen Wang, Pedro F. Ferreira, Camila Munoz, Ke Wen, Yaqing Luo, Jiahao Huang, Yinzhe Wu, Dudley J. Pennell, Andrew D. Scott, Sonia Nielles-Vallespin, Guang Yang
Abstract: Motivation: Post-processing of in-vivo diffusion tensor CMR (DT-CMR) is challenging due to the low SNR and the variation in contrast between frames, which makes image registration difficult, and due to the need to manually reject frames corrupted by motion. Goals: To develop a semi-automatic post-processing pipeline for robust DT-CMR registration and automatic frame selection. Approach: We used low-intrinsic-rank averaged frames as the reference to register other low-ranked frames. A myocardium-guided frame selection rejected the frames with signal loss, through-plane motion and poor registration. Results: The proposed method outperformed our previous noise-robust rigid registration on helix angle data quality and reduced negative eigenvalues in healthy volunteers.
Submitted 19 June, 2024; originally announced June 2024.
Comments: Accepted as ISMRM 2024 Digital poster 2141
Journal ref: ISMRM 2024 Digital poster 2141

arXiv:2406.09317 (https://arxiv.org/abs/2406.09317) [pdf, other]
Subjects: Image and Video Processing (eess.IV); Computer Vision and Pattern Recognition (cs.CV)
Title: Common and Rare Fundus Diseases Identification Using Vision-Language Foundation Model with Knowledge of Over 400 Diseases
Authors: Meng Wang, Tian Lin, Aidi Lin, Kai Yu, Yuanyuan Peng, Lianyu Wang, Cheng Chen, Ke Zou, Huiyu Liang, Man Chen, Xue Yao, Meiqin Zhang, Binwei Huang, Chaoxin Zheng, Peixin Zhang, Wei Chen, Yilong Luo, Yifan Chen, Honghe Xia, Tingkun Shi, Qi Zhang, Jinming Guo, Xiaolin Chen, Jingcheng Wang, Yih Chung Tham, et al. (24 additional authors not shown)
Abstract: Previous foundation models for retinal images were pre-trained with limited disease categories and knowledge bases. Here we introduce RetiZero, a vision-language foundation model that leverages knowledge from over 400 fundus diseases. For RetiZero's pre-training, we compiled 341,896 fundus images paired with text descriptions, sourced from public datasets, ophthalmic literature, and online resources, encompassing a diverse range of diseases across multiple ethnicities and countries. RetiZero exhibits superior performance in several downstream tasks, including zero-shot disease recognition, image-to-image retrieval, and internal- and cross-domain disease identification. In zero-shot scenarios, RetiZero achieves Top-5 accuracy scores of 0.8430 for 15 fundus diseases and 0.7561 for 52 fundus diseases. For image retrieval, it achieves Top-5 scores of 0.9500 and 0.8860 for the same disease sets, respectively. Clinical evaluations show that RetiZero's Top-3 zero-shot performance surpasses the average of 19 ophthalmologists from Singapore, China and the United States. Furthermore, RetiZero significantly enhances clinicians' accuracy in diagnosing fundus disease. These findings underscore the value of integrating the RetiZero foundation model into clinical settings, where a wide variety of fundus diseases is encountered.
Submitted 30 June, 2024; v1 submitted 13 June, 2024; originally announced June 2024.
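
Zero-shot disease recognition with a vision-language model of this kind generally reduces to comparing an image embedding with text embeddings of candidate disease descriptions. The sketch below shows only that scoring step, with random stand-ins for the encoders; RetiZero's actual encoders, prompts, and embedding size are not given in this listing, so everything named here is an assumption.

```python
import numpy as np

def zero_shot_scores(image_emb, text_embs):
    """Cosine-similarity scores between one image embedding and a bank of
    text embeddings (one per candidate disease); higher means more likely."""
    img = image_emb / np.linalg.norm(image_emb)
    txt = text_embs / np.linalg.norm(text_embs, axis=1, keepdims=True)
    return txt @ img

rng = np.random.default_rng(0)
image_emb = rng.normal(size=512)               # stand-in for an image encoder output
text_embs = rng.normal(size=(52, 512))         # stand-in embeddings of 52 disease prompts
top5 = np.argsort(zero_shot_scores(image_emb, text_embs))[::-1][:5]
print(top5)                                    # indices of the 5 highest-scoring diseases
```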

arXiv:2406.04791 (https://arxiv.org/abs/2406.04791) [pdf, other]
Subjects: Sound (cs.SD); Audio and Speech Processing (eess.AS)
Title: Speaker-Smoothed kNN Speaker Adaptation for End-to-End ASR
Authors: Shaojun Li, Daimeng Wei, Hengchao Shang, Jiaxin Guo, ZongYao Li, Zhanglin Wu, Zhiqiang Rao, Yuanchang Luo, Xianghui He, Hao Yang
Abstract: Despite recent improvements in End-to-End Automatic Speech Recognition (E2E ASR) systems, performance can degrade due to vocal characteristic mismatches between training and testing data, particularly with limited target-speaker adaptation data. We propose a novel speaker adaptation approach, Speaker-Smoothed kNN, that leverages k-Nearest Neighbors (kNN) retrieval to improve model output by finding correctly pronounced tokens in a pre-built datastore during the decoding phase. Moreover, we utilize x-vectors to dynamically adjust the kNN interpolation parameters and mitigate data sparsity. The approach was validated on the KeSpeech and MagicData corpora under in-domain and all-domain settings. Our method consistently performs comparably to fine-tuning without the associated performance degradation during speaker changes. Furthermore, in the all-domain setting, our method achieves state-of-the-art results, reducing the CER in both single-speaker and multi-speaker test scenarios.
Submitted 1 July, 2024; v1 submitted 7 June, 2024; originally announced June 2024.
Comments: Accepted to Interspeech 2024
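
The kNN side of such a decoder can be summarized as: retrieve the k nearest (hidden-state, token) pairs from the datastore, turn their distances into a distribution over the vocabulary, and interpolate it with the model's own distribution. The sketch below shows that interpolation only; the x-vector-driven choice of the weight lambda is not reproduced and is treated here as given.

```python
import numpy as np

def knn_smoothed_probs(hidden, keys, values, model_probs, k=8, lam=0.3, temperature=10.0):
    """hidden: (d,) decoder state; keys: (N, d) datastore states; values: (N,) token ids;
    model_probs: (V,) model distribution. Returns lam * p_kNN + (1 - lam) * p_model."""
    dists = np.linalg.norm(keys - hidden, axis=1)       # L2 distance to every datastore entry
    nn_idx = np.argsort(dists)[:k]                      # k nearest neighbours
    weights = np.exp(-dists[nn_idx] / temperature)
    weights /= weights.sum()
    p_knn = np.zeros_like(model_probs)
    np.add.at(p_knn, values[nn_idx], weights)           # scatter neighbour mass onto their tokens
    return lam * p_knn + (1.0 - lam) * model_probs

# toy usage: 100-token vocabulary, 1000-entry datastore
rng = np.random.default_rng(1)
keys, values = rng.normal(size=(1000, 64)), rng.integers(0, 100, size=1000)
probs = np.full(100, 1 / 100)
print(knn_smoothed_probs(rng.normal(size=64), keys, values, probs).sum())  # ~1.0
```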

arXiv:2406.00497 (https://arxiv.org/abs/2406.00497) [pdf, ps, other]
Subjects: Sound (cs.SD); Artificial Intelligence (cs.AI); Computation and Language (cs.CL); Audio and Speech Processing (eess.AS)
Title: Recent Advances in End-to-End Simultaneous Speech Translation
Authors: Xiaoqian Liu, Guoqiang Hu, Yangfan Du, Erfeng He, Yingfeng Luo, Chen Xu, Tong Xiao, Jingbo Zhu
Abstract: Simultaneous speech translation (SimulST) is a demanding task that involves generating translations in real time while continuously processing speech input. This paper offers a comprehensive overview of recent developments in SimulST research, focusing on four major challenges. Firstly, the complexities associated with processing lengthy and continuous speech streams pose significant hurdles. Secondly, satisfying real-time requirements presents inherent difficulties due to the need for immediate translation output. Thirdly, striking a balance between translation quality and latency constraints remains a critical challenge. Finally, the scarcity of annotated data adds another layer of complexity to the task. Through our exploration of these challenges and the proposed solutions, we aim to provide valuable insights into the current landscape of SimulST research and suggest promising directions for future exploration.
Submitted 20 August, 2024; v1 submitted 1 June, 2024; originally announced June 2024.
Comments: Accepted by IJCAI 2024

arXiv:2406.00492 (https://arxiv.org/abs/2406.00492) [pdf, other]
Subjects: Image and Video Processing (eess.IV); Computer Vision and Pattern Recognition (cs.CV); Machine Learning (cs.LG)
Title: SAM-VMNet: Deep Neural Networks For Coronary Angiography Vessel Segmentation
Authors: Xueying Zeng, Baixiang Huang, Yu Luo, Guangyu Wei, Songyan He, Yushuang Shao
Abstract: Coronary artery disease (CAD) is one of the most prevalent diseases in the cardiovascular field and one of the major contributors to death worldwide. Computed Tomography Angiography (CTA) images are regarded as the authoritative standard for the diagnosis of coronary artery disease, and by performing vessel segmentation and stenosis detection on CTA images, physicians are able to diagnose coronary artery disease more accurately. In order to combine the advantages of both the base model and the domain-specific model, and to achieve high-precision and fully automatic segmentation and detection with a limited number of training samples, we propose a novel architecture, SAM-VMNet, which combines the powerful feature extraction capability of MedSAM with the linear-complexity visual state-space model of VM-UNet, giving it faster inference than a Vision Transformer and stronger data processing capability, and achieving higher segmentation accuracy and stability on CTA images. Experimental results show that the SAM-VMNet architecture performs excellently on the CTA image segmentation task, with a segmentation accuracy of up to 98.32% and a sensitivity of up to 99.33%, which is significantly better than other existing models, and that it has stronger domain adaptability. A comprehensive evaluation of the CTA image segmentation task shows that SAM-VMNet accurately extracts the vascular trunks and capillaries, demonstrating its great potential and wide range of application scenarios for the vascular segmentation task, and laying a solid foundation for further stenosis detection.
Submitted 1 June, 2024; originally announced June 2024.

arXiv:2405.17241 (https://arxiv.org/abs/2405.17241) [pdf, other]
Subjects: Computer Vision and Pattern Recognition (cs.CV); Image and Video Processing (eess.IV)
Title: NeurTV: Total Variation on the Neural Domain
Authors: Yisi Luo, Xile Zhao, Kai Ye, Deyu Meng
Abstract: Recently, we have witnessed the success of total variation (TV) for many imaging applications. However, traditional TV is defined on the original pixel domain, which limits its potential. In this work, we suggest a new TV regularization defined on the neural domain. Concretely, the discrete data is continuously and implicitly represented by a deep neural network (DNN), and we use the derivatives of the DNN outputs w.r.t. the input coordinates to capture local correlations of the data. As compared with classical TV on the original domain, the proposed TV on the neural domain (termed NeurTV) enjoys two advantages. First, NeurTV is not limited to meshgrid data but is suitable for both meshgrid and non-meshgrid data. Second, NeurTV can more exactly capture local correlations across data for any direction and any order of derivatives, owing to the implicit and continuous nature of the neural domain. We theoretically reinterpret NeurTV under the variational approximation framework, which allows us to build the connection between classical TV and NeurTV and inspires us to develop variants (e.g., NeurTV with arbitrary resolution and space-variant NeurTV). Extensive numerical experiments with meshgrid data (e.g., color and hyperspectral images) and non-meshgrid data (e.g., point clouds and spatial transcriptomics) showcase the effectiveness of the proposed methods.
Submitted 27 May, 2024; originally announced May 2024.
MSC Class: 94A08; 68U10; 68T45
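
The core of a TV regularizer on the neural domain is to represent the data with a coordinate network and to penalize its derivatives with respect to the input coordinates, which autograd provides directly. The sketch below is a minimal illustration under assumed choices (network size, random coordinate sampling, an anisotropic L1 form); the paper develops more refined variants than this.

```python
import torch
import torch.nn as nn

# coordinate MLP f(x, y) -> intensity: an implicit, continuous representation of the data
f = nn.Sequential(nn.Linear(2, 128), nn.Tanh(),
                  nn.Linear(128, 128), nn.Tanh(),
                  nn.Linear(128, 1))

def neural_tv(coords):
    """Anisotropic TV on the neural domain: mean of |df/dx| + |df/dy| at the sampled
    coordinates, computed with autograd instead of finite differences on a pixel grid."""
    coords = coords.clone().requires_grad_(True)
    out = f(coords)
    grads, = torch.autograd.grad(out.sum(), coords, create_graph=True)
    return grads.abs().sum(dim=1).mean()

# sample coordinates anywhere in [0, 1]^2 -- no meshgrid is required
pts = torch.rand(4096, 2)
loss = neural_tv(pts)      # in practice, added to a data-fidelity term before backprop
loss.backward()
print(float(loss))
```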

arXiv:2405.05564 (https://arxiv.org/abs/2405.05564) [pdf, other]
Subjects: Image and Video Processing (eess.IV); Computer Vision and Pattern Recognition (cs.CV); Machine Learning (cs.LG)
Title: Joint Edge Optimization Deep Unfolding Network for Accelerated MRI Reconstruction
Authors: Yue Cai, Yu Luo, Jie Ling, Shun Yao
Abstract: Magnetic Resonance Imaging (MRI) is a widely used imaging technique; however, it suffers from long scanning times. Though previous model-based and learning-based MRI reconstruction methods have shown promising performance, most of them have not fully utilized the edge prior of MR images, and there is still much room for improvement. In this paper, we build a joint edge optimization model that not only incorporates individual regularizers specific to both the MR image and the edges, but also enforces a co-regularizer to effectively establish a stronger correlation between them. Specifically, the edge information is defined through a non-edge probability map to guide the image reconstruction during the optimization process. Meanwhile, the regularizers pertaining to images and edges are incorporated into a deep unfolding network to automatically learn their respective inherent a priori information. Numerical experiments on multi-coil and single-coil MRI data with different sampling schemes and a variety of sampling factors demonstrate that the proposed method outperforms the compared methods.
Submitted 9 May, 2024; originally announced May 2024.
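
To make the wording above concrete, a joint image-edge reconstruction model of this kind can be written schematically as below. The symbols (sampling mask M, Fourier operator F, non-edge probability map e, regularizers R) are generic placeholders of my own, not the paper's exact formulation, whose regularizers are learned inside the unfolded network.

```latex
\[
  \min_{\mathbf{x},\,\mathbf{e}}\;
  \tfrac{1}{2}\,\lVert \mathbf{M}\mathbf{F}\mathbf{x}-\mathbf{y}\rVert_2^2
  \;+\;\lambda_1 R_{\mathrm{img}}(\mathbf{x})
  \;+\;\lambda_2 R_{\mathrm{edge}}(\mathbf{e})
  \;+\;\lambda_3 \lVert \mathbf{e}\odot\nabla\mathbf{x}\rVert_1 ,
\]
% x: MR image, y: undersampled k-space, e in [0,1]: non-edge probability map.
% The last (co-regularization) term penalizes image gradients only where e indicates "no edge";
% a deep unfolding network alternates updates of x and e with learned proximal steps.
```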

arXiv:2405.04865 (https://arxiv.org/abs/2405.04865) [pdf, ps, other]
Subjects: Machine Learning (cs.LG); Signal Processing (eess.SP)
Title: Regime Learning for Differentiable Particle Filters
Authors: John-Joseph Brady, Yuhui Luo, Wenwu Wang, Victor Elvira, Yunpeng Li
Abstract: Differentiable particle filters are an emerging class of models that combine sequential Monte Carlo techniques with the flexibility of neural networks to perform state space inference. This paper concerns the case where the system may switch between a finite set of state-space models, i.e. regimes. No prior approaches effectively learn both the individual regimes and the switching process simultaneously. In this paper, we propose the neural-network-based regime learning differentiable particle filter (RLPF) to address this problem. We further design a training procedure for the RLPF and other related algorithms. We demonstrate competitive performance compared to the previous state-of-the-art algorithms on a pair of numerical experiments.
Submitted 12 June, 2024; v1 submitted 8 May, 2024; originally announced May 2024.
MSC Class: 68T37  ACM Class: I.2.6

arXiv:2404.15163 (https://arxiv.org/abs/2404.15163) [pdf, other]
Subjects: Computer Vision and Pattern Recognition (cs.CV); Image and Video Processing (eess.IV)
Title: Adaptive Mixed-Scale Feature Fusion Network for Blind AI-Generated Image Quality Assessment
Authors: Tianwei Zhou, Songbai Tan, Wei Zhou, Yu Luo, Yuan-Gen Wang, Guanghui Yue
Abstract: With the increasing maturity of text-to-image and image-to-image generative models, AI-generated images (AGIs) have shown great application potential in advertisement, entertainment, education, social media, etc. Although remarkable advancements have been achieved in generative models, very little effort has been devoted to designing relevant quality assessment models. In this paper, we propose a novel blind image quality assessment (IQA) network, named AMFF-Net, for AGIs. AMFF-Net evaluates AGI quality from three dimensions, i.e., "visual quality", "authenticity", and "consistency". Specifically, inspired by the characteristics of the human visual system and motivated by the observation that "visual quality" and "authenticity" are characterized by both local and global aspects, AMFF-Net scales the image up and down and takes the scaled images and the original-sized image as inputs to obtain multi-scale features. After that, an Adaptive Feature Fusion (AFF) block is used to adaptively fuse the multi-scale features with learnable weights. In addition, considering the correlation between the image and its prompt, AMFF-Net compares the semantic features from the text encoder and the image encoder to evaluate the text-to-image alignment. We carry out extensive experiments on three AGI quality assessment databases, and the experimental results show that our AMFF-Net obtains better performance than nine state-of-the-art blind IQA methods. The results of ablation experiments further demonstrate the effectiveness of the proposed multi-scale input strategy and AFF block.
Submitted 23 April, 2024; originally announced April 2024.
Comments: IEEE Transactions on Broadcasting (TBC)
href="/search/eess?searchtype=author&query=Fan%2C+C">Chao Fan</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+H">Heng Zhang</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+X">Xiaolin Zhang</a>, <a href="/search/eess?searchtype=author&query=Yin%2C+X">Xuanwu Yin</a>, <a href="/search/eess?searchtype=author&query=Zuo%2C+K">Kunlong Zuo</a>, <a href="/search/eess?searchtype=author&query=Liao%2C+B">Bohao Liao</a>, <a href="/search/eess?searchtype=author&query=Xia%2C+P">Peizhe Xia</a>, <a href="/search/eess?searchtype=author&query=Peng%2C+L">Long Peng</a>, <a href="/search/eess?searchtype=author&query=Du%2C+Z">Zhibo Du</a>, <a href="/search/eess?searchtype=author&query=Di%2C+X">Xin Di</a>, <a href="/search/eess?searchtype=author&query=Li%2C+W">Wangkai Li</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+Y">Yang Wang</a> , et al. (109 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.10343v2-abstract-short" style="display: inline;"> This paper provides a comprehensive review of the NTIRE 2024 challenge, focusing on efficient single-image super-resolution (ESR) solutions and their outcomes. The task of this challenge is to super-resolve an input image with a magnification factor of x4 based on pairs of low and corresponding high-resolution images. The primary objective is to develop networks that optimize various aspects such… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.10343v2-abstract-full').style.display = 'inline'; document.getElementById('2404.10343v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.10343v2-abstract-full" style="display: none;"> This paper provides a comprehensive review of the NTIRE 2024 challenge, focusing on efficient single-image super-resolution (ESR) solutions and their outcomes. The task of this challenge is to super-resolve an input image with a magnification factor of x4 based on pairs of low and corresponding high-resolution images. The primary objective is to develop networks that optimize various aspects such as runtime, parameters, and FLOPs, while still maintaining a peak signal-to-noise ratio (PSNR) of approximately 26.90 dB on the DIV2K_LSDIR_valid dataset and 26.99 dB on the DIV2K_LSDIR_test dataset. In addition, this challenge has 4 tracks including the main track (overall performance), sub-track 1 (runtime), sub-track 2 (FLOPs), and sub-track 3 (parameters). In the main track, all three metrics (ie runtime, FLOPs, and parameter count) were considered. The ranking of the main track is calculated based on a weighted sum-up of the scores of all other sub-tracks. In sub-track 1, the practical runtime performance of the submissions was evaluated, and the corresponding score was used to determine the ranking. In sub-track 2, the number of FLOPs was considered. The score calculated based on the corresponding FLOPs was used to determine the ranking. In sub-track 3, the number of parameters was considered. The score calculated based on the corresponding parameters was used to determine the ranking. RLFN is set as the baseline for efficiency measurement. The challenge had 262 registered participants, and 34 teams made valid submissions. They gauge the state-of-the-art in efficient single-image super-resolution. 
To facilitate the reproducibility of the challenge and enable other researchers to build upon these findings, the code and the pre-trained model of validated solutions are made publicly available at https://github.com/Amazingren/NTIRE2024_ESR/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.10343v2-abstract-full').style.display = 'none'; document.getElementById('2404.10343v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 16 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">The report paper of NTIRE2024 Efficient Super-resolution, accepted by CVPRW2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.04947">arXiv:2404.04947</a> <span> [<a href="https://arxiv.org/pdf/2404.04947">pdf</a>, <a href="https://arxiv.org/format/2404.04947">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Gull: A Generative Multifunctional Audio Codec </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Luo%2C+Y">Yi Luo</a>, <a href="/search/eess?searchtype=author&query=Yu%2C+J">Jianwei Yu</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+H">Hangting Chen</a>, <a href="/search/eess?searchtype=author&query=Gu%2C+R">Rongzhi Gu</a>, <a href="/search/eess?searchtype=author&query=Weng%2C+C">Chao Weng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.04947v2-abstract-short" style="display: inline;"> We introduce Gull, a generative multifunctional audio codec. Gull is a general purpose neural audio compression and decompression model which can be applied to a wide range of tasks and applications such as real-time communication, audio super-resolution, and codec language models. The key components of Gull include (1) universal-sample-rate modeling via subband modeling schemes motivated by recen… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.04947v2-abstract-full').style.display = 'inline'; document.getElementById('2404.04947v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.04947v2-abstract-full" style="display: none;"> We introduce Gull, a generative multifunctional audio codec. 
Gull is a general purpose neural audio compression and decompression model which can be applied to a wide range of tasks and applications such as real-time communication, audio super-resolution, and codec language models. The key components of Gull include (1) universal-sample-rate modeling via subband modeling schemes motivated by recent progress in audio source separation, (2) gain-shape representations motivated by traditional audio codecs, (3) improved residual vector quantization modules, (4) elastic decoder network that enables user-defined model size and complexity during inference time, (5) built-in ability for audio super-resolution without the increase of bitrate. We compare Gull with existing traditional and neural audio codecs and show that Gull is able to achieve on par or better performance across various sample rates, bitrates and model complexities in both subjective and objective evaluation metrics. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.04947v2-abstract-full').style.display = 'none'; document.getElementById('2404.04947v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 7 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Demo page: https://yluo42.github.io/Gull/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.01563">arXiv:2404.01563</a> <span> [<a href="https://arxiv.org/pdf/2404.01563">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Two-Phase Multi-Dose-Level PET Image Reconstruction with Dose Level Awareness </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Fei%2C+Y">Yuchen Fei</a>, <a href="/search/eess?searchtype=author&query=Luo%2C+Y">Yanmei Luo</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+Y">Yan Wang</a>, <a href="/search/eess?searchtype=author&query=Cui%2C+J">Jiaqi Cui</a>, <a href="/search/eess?searchtype=author&query=Xu%2C+Y">Yuanyuan Xu</a>, <a href="/search/eess?searchtype=author&query=Zhou%2C+J">Jiliu Zhou</a>, <a href="/search/eess?searchtype=author&query=Shen%2C+D">Dinggang Shen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.01563v2-abstract-short" style="display: inline;"> To obtain high-quality positron emission tomography (PET) while minimizing radiation exposure, a range of methods have been designed to reconstruct standard-dose PET (SPET) from corresponding low-dose PET (LPET) images. However, most current methods merely learn the mapping between single-dose-level LPET and SPET images, but omit the dose disparity of LPET images in clinical scenarios. 
In this pap… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.01563v2-abstract-full').style.display = 'inline'; document.getElementById('2404.01563v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.01563v2-abstract-full" style="display: none;"> To obtain high-quality positron emission tomography (PET) while minimizing radiation exposure, a range of methods have been designed to reconstruct standard-dose PET (SPET) from corresponding low-dose PET (LPET) images. However, most current methods merely learn the mapping between single-dose-level LPET and SPET images, but omit the dose disparity of LPET images in clinical scenarios. In this paper, to reconstruct high-quality SPET images from multi-dose-level LPET images, we design a novel two-phase multi-dose-level PET reconstruction algorithm with dose level awareness, containing a pre-training phase and a SPET prediction phase. Specifically, the pre-training phase is devised to explore both fine-grained discriminative features and effective semantic representation. The SPET prediction phase adopts a coarse prediction network utilizing pre-learned dose level prior to generate preliminary result, and a refinement network to precisely preserve the details. Experiments on MICCAI 2022 Ultra-low Dose PET Imaging Challenge Dataset have demonstrated the superiority of our method. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.01563v2-abstract-full').style.display = 'none'; document.getElementById('2404.01563v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 1 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ISBI2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.11092">arXiv:2403.11092</a> <span> [<a href="https://arxiv.org/pdf/2403.11092">pdf</a>, <a href="https://arxiv.org/format/2403.11092">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Lost in Translation? 
Translation Errors and Challenges for Fair Assessment of Text-to-Image Models on Multilingual Concepts </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Saxon%2C+M">Michael Saxon</a>, <a href="/search/eess?searchtype=author&query=Luo%2C+Y">Yiran Luo</a>, <a href="/search/eess?searchtype=author&query=Levy%2C+S">Sharon Levy</a>, <a href="/search/eess?searchtype=author&query=Baral%2C+C">Chitta Baral</a>, <a href="/search/eess?searchtype=author&query=Yang%2C+Y">Yezhou Yang</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+W+Y">William Yang Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.11092v1-abstract-short" style="display: inline;"> Benchmarks of the multilingual capabilities of text-to-image (T2I) models compare generated images prompted in a test language to an expected image distribution over a concept set. One such benchmark, "Conceptual Coverage Across Languages" (CoCo-CroLa), assesses the tangible noun inventory of T2I models by prompting them to generate pictures from a concept list translated to seven languages and co… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.11092v1-abstract-full').style.display = 'inline'; document.getElementById('2403.11092v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.11092v1-abstract-full" style="display: none;"> Benchmarks of the multilingual capabilities of text-to-image (T2I) models compare generated images prompted in a test language to an expected image distribution over a concept set. One such benchmark, "Conceptual Coverage Across Languages" (CoCo-CroLa), assesses the tangible noun inventory of T2I models by prompting them to generate pictures from a concept list translated to seven languages and comparing the output image populations. Unfortunately, we find that this benchmark contains translation errors of varying severity in Spanish, Japanese, and Chinese. We provide corrections for these errors and analyze how impactful they are on the utility and validity of CoCo-CroLa as a benchmark. We reassess multiple baseline T2I models with the revisions, compare the outputs elicited under the new translations to those conditioned on the old, and show that a correction's impactfulness on the image-domain benchmark results can be predicted in the text domain with similarity scores. Our findings will guide the future development of T2I multilinguality metrics by providing analytical tools for practical translation decisions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.11092v1-abstract-full').style.display = 'none'; document.getElementById('2403.11092v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">NAACL 2024 Main Conference</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.09353">arXiv:2403.09353</a> <span> [<a href="https://arxiv.org/pdf/2403.09353">pdf</a>, <a href="https://arxiv.org/ps/2403.09353">ps</a>, <a href="https://arxiv.org/format/2403.09353">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/LCOMM.2023.3344599">10.1109/LCOMM.2023.3344599 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Intelligent Reflecting Surfaces vs. Full-Duplex Relays: A Comparison in the Air </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Ding%2C+Q">Qian Ding</a>, <a href="/search/eess?searchtype=author&query=Yang%2C+J">Jie Yang</a>, <a href="/search/eess?searchtype=author&query=Luo%2C+Y">Yang Luo</a>, <a href="/search/eess?searchtype=author&query=Luo%2C+C">Chunbo Luo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.09353v1-abstract-short" style="display: inline;"> This letter aims to provide a fundamental analytical comparison for the two major types of relaying methods: intelligent reflecting surfaces and full-duplex relays, particularly focusing on unmanned aerial vehicle communication scenarios. Both amplify-and-forward and decode-and-forward relaying schemes are included in the comparison. In addition, optimal 3D UAV deployment and minimum transmit powe… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.09353v1-abstract-full').style.display = 'inline'; document.getElementById('2403.09353v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.09353v1-abstract-full" style="display: none;"> This letter aims to provide a fundamental analytical comparison for the two major types of relaying methods: intelligent reflecting surfaces and full-duplex relays, particularly focusing on unmanned aerial vehicle communication scenarios. Both amplify-and-forward and decode-and-forward relaying schemes are included in the comparison. In addition, optimal 3D UAV deployment and minimum transmit power under the quality of service constraint are derived. Our numerical results show that IRSs of medium size exhibit comparable performance to AF relays, meanwhile outperforming DF relays under extremely large surface size and high data rates. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.09353v1-abstract-full').style.display = 'none'; document.getElementById('2403.09353v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> IEEE Communications Letters, vol. 28, no. 2, pp. 397-401, Feb. 2024 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.09223">arXiv:2403.09223</a> <span> [<a href="https://arxiv.org/pdf/2403.09223">pdf</a>, <a href="https://arxiv.org/format/2403.09223">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> MCformer: Multivariate Time Series Forecasting with Mixed-Channels Transformer </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Han%2C+W">Wenyong Han</a>, <a href="/search/eess?searchtype=author&query=Member%2C+T+Z">Tao Zhu Member</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+L">Liming Chen</a>, <a href="/search/eess?searchtype=author&query=Ning%2C+H">Huansheng Ning</a>, <a href="/search/eess?searchtype=author&query=Luo%2C+Y">Yang Luo</a>, <a href="/search/eess?searchtype=author&query=Wan%2C+Y">Yaping Wan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.09223v1-abstract-short" style="display: inline;"> The massive generation of time-series data by largescale Internet of Things (IoT) devices necessitates the exploration of more effective models for multivariate time-series forecasting. In previous models, there was a predominant use of the Channel Dependence (CD) strategy (where each channel represents a univariate sequence). Current state-of-the-art (SOTA) models primarily rely on the Channel In… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.09223v1-abstract-full').style.display = 'inline'; document.getElementById('2403.09223v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.09223v1-abstract-full" style="display: none;"> The massive generation of time-series data by largescale Internet of Things (IoT) devices necessitates the exploration of more effective models for multivariate time-series forecasting. In previous models, there was a predominant use of the Channel Dependence (CD) strategy (where each channel represents a univariate sequence). Current state-of-the-art (SOTA) models primarily rely on the Channel Independence (CI) strategy. The CI strategy treats all channels as a single channel, expanding the dataset to improve generalization performance and avoiding inter-channel correlation that disrupts long-term features. However, the CI strategy faces the challenge of interchannel correlation forgetting. 
To address this issue, we propose an innovative Mixed Channels strategy, combining the data expansion advantages of the CI strategy with the ability to counteract inter-channel correlation forgetting. Based on this strategy, we introduce MCformer, a multivariate time-series forecasting model with mixed channel features. The model blends a specific number of channels, leveraging an attention mechanism to effectively capture inter-channel correlation information when modeling long-term features. Experimental results demonstrate that the Mixed Channels strategy outperforms pure CI strategy in multivariate time-series forecasting tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.09223v1-abstract-full').style.display = 'none'; document.getElementById('2403.09223v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.08680">arXiv:2403.08680</a> <span> [<a href="https://arxiv.org/pdf/2403.08680">pdf</a>, <a href="https://arxiv.org/format/2403.08680">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Towards the THz Networks in the 6G Era </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Ding%2C+Q">Qian Ding</a>, <a href="/search/eess?searchtype=author&query=Yang%2C+J">Jie Yang</a>, <a href="/search/eess?searchtype=author&query=Luo%2C+Y">Yang Luo</a>, <a href="/search/eess?searchtype=author&query=Luo%2C+C">Chunbo Luo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.08680v1-abstract-short" style="display: inline;"> This commentary dedicates to envision what role THz is going to play in the coming human-centric 6G era. Three distinct THz network types including outdoor, indoor, and body area networks are discussed, with an emphasis on their capabilities in human body detection. Synthesizing these networks will unlock a bunch of fascinating applications across industrial, biomedical and entertainment fields, s… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.08680v1-abstract-full').style.display = 'inline'; document.getElementById('2403.08680v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.08680v1-abstract-full" style="display: none;"> This commentary dedicates to envision what role THz is going to play in the coming human-centric 6G era. Three distinct THz network types including outdoor, indoor, and body area networks are discussed, with an emphasis on their capabilities in human body detection. Synthesizing these networks will unlock a bunch of fascinating applications across industrial, biomedical and entertainment fields, significantly enhancing the quality of human life. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.08680v1-abstract-full').style.display = 'none'; document.getElementById('2403.08680v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.04626">arXiv:2403.04626</a> <span> [<a href="https://arxiv.org/pdf/2403.04626">pdf</a>, <a href="https://arxiv.org/format/2403.04626">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> MedFLIP: Medical Vision-and-Language Self-supervised Fast Pre-Training with Masked Autoencoder </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Li%2C+L">Lei Li</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+T">Tianfang Zhang</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+X">Xinglin Zhang</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+J">Jiaqi Liu</a>, <a href="/search/eess?searchtype=author&query=Ma%2C+B">Bingqi Ma</a>, <a href="/search/eess?searchtype=author&query=Luo%2C+Y">Yan Luo</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+T">Tao Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.04626v2-abstract-short" style="display: inline;"> Within the domain of medical analysis, extensive research has explored the potential of mutual learning between Masked Autoencoders(MAEs) and multimodal data. However, the impact of MAEs on intermodality remains a key challenge. We introduce MedFLIP, a Fast Language-Image Pre-training method for Medical analysis. We explore MAEs for zero-shot learning with crossed domains, which enhances the model… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.04626v2-abstract-full').style.display = 'inline'; document.getElementById('2403.04626v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.04626v2-abstract-full" style="display: none;"> Within the domain of medical analysis, extensive research has explored the potential of mutual learning between Masked Autoencoders(MAEs) and multimodal data. However, the impact of MAEs on intermodality remains a key challenge. We introduce MedFLIP, a Fast Language-Image Pre-training method for Medical analysis. We explore MAEs for zero-shot learning with crossed domains, which enhances the model's ability to learn from limited data, a common scenario in medical diagnostics. We verify that masking an image does not affect inter-modal learning. 
Furthermore, we propose the SVD loss to enhance the representation learning for characteristics of medical images, aiming to improve classification accuracy by leveraging the structural intricacies of such data. Our theory posits that masking encourages semantic preservation, robust feature extraction, regularization, domain adaptation, and invariance learning. Lastly, we validate that using language improves zero-shot performance for medical image analysis. MedFLIP's scaling of the masking process marks an advancement in the field, offering a pathway to rapid and precise medical image analysis without the traditional computational bottlenecks. Through experiments and validation, MedFLIP demonstrates efficient performance improvements, supporting future research and application in medical diagnostics. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.04626v2-abstract-full').style.display = 'none'; document.getElementById('2403.04626v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 7 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.03809">arXiv:2403.03809</a> <span> [<a href="https://arxiv.org/pdf/2403.03809">pdf</a>, <a href="https://arxiv.org/ps/2403.03809">ps</a>, <a href="https://arxiv.org/format/2403.03809">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Variational Bayesian Learning based Joint Localization and Path Loss Exponent with Distance-dependent Noise in Wireless Sensor Network </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Li%2C+Y">Yunfei Li</a>, <a href="/search/eess?searchtype=author&query=Luo%2C+Y">Yiting Luo</a>, <a href="/search/eess?searchtype=author&query=Tan%2C+W">Weiqiang Tan</a>, <a href="/search/eess?searchtype=author&query=Li%2C+C">Chunguo Li</a>, <a href="/search/eess?searchtype=author&query=Ma%2C+S">Shaodan Ma</a>, <a href="/search/eess?searchtype=author&query=Yang%2C+G">Guanghua Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.03809v3-abstract-short" style="display: inline;"> This paper focuses on the challenge of jointly optimizing location and path loss exponent (PLE) in distance-dependent noise. Departing from the conventional independent noise model used in localization and path loss exponent estimation problems, we consider a more realistic model incorporating distance-dependent noise variance, as revealed in recent theoretical analyses and experimental results. 
T… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.03809v3-abstract-full').style.display = 'inline'; document.getElementById('2403.03809v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.03809v3-abstract-full" style="display: none;"> This paper focuses on the challenge of jointly optimizing location and path loss exponent (PLE) in distance-dependent noise. Departing from the conventional independent noise model used in localization and path loss exponent estimation problems, we consider a more realistic model incorporating distance-dependent noise variance, as revealed in recent theoretical analyses and experimental results. The distance-dependent noise introduces a complex noise model with unknown noise power and PLE, resulting in an exceptionally challenging non-convex and nonlinear optimization problem. In this study, we address a joint localization and path loss exponent estimation problem encompassing distance-dependent noise, unknown parameters, and uncertainties in sensor node locations. To surmount the intractable nonlinear and non-convex objective function inherent in the problem, we introduce a variational Bayesian learning-based framework that enables the joint optimization of localization, path loss exponent, and reference noise parameters by leveraging an effective approximation to the true posterior distribution. Furthermore, the proposed joint learning algorithm provides an iterative closed-form solution and exhibits superior performance in terms of computational complexity compared to existing algorithms. Computer simulation results demonstrate that the proposed algorithm approaches the performance of the Bayesian Cramer-Rao bound (BCRB), achieves localization performance comparable to the (maximum likelihood-Gaussian message passing) ML-GMP algorithm in some cases, and outperforms the other comparison algorithm in all cases. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.03809v3-abstract-full').style.display = 'none'; document.getElementById('2403.03809v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 6 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.01265">arXiv:2403.01265</a> <span> [<a href="https://arxiv.org/pdf/2403.01265">pdf</a>, <a href="https://arxiv.org/format/2403.01265">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Smooth Computation without Input Delay: Robust Tube-Based Model Predictive Control for Robot Manipulator Planning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Luo%2C+Y">Yu Luo</a>, <a href="/search/eess?searchtype=author&query=Sima%2C+Q">Qie Sima</a>, <a href="/search/eess?searchtype=author&query=Ji%2C+T">Tianying Ji</a>, <a href="/search/eess?searchtype=author&query=Sun%2C+F">Fuchun Sun</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+H">Huaping Liu</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+J">Jianwei Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.01265v3-abstract-short" style="display: inline;"> Model Predictive Control (MPC) has exhibited remarkable capabilities in optimizing objectives and meeting constraints. However, the substantial computational burden associated with solving the Optimal Control Problem (OCP) at each triggering instant introduces significant delays between state sampling and control application. These delays limit the practicality of MPC in resource-constrained syste… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.01265v3-abstract-full').style.display = 'inline'; document.getElementById('2403.01265v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.01265v3-abstract-full" style="display: none;"> Model Predictive Control (MPC) has exhibited remarkable capabilities in optimizing objectives and meeting constraints. However, the substantial computational burden associated with solving the Optimal Control Problem (OCP) at each triggering instant introduces significant delays between state sampling and control application. These delays limit the practicality of MPC in resource-constrained systems when engaging in complex tasks. The intuition to address this issue in this paper is that by predicting the successor state, the controller can solve the OCP one time step ahead of time thus avoiding the delay of the next action. To this end, we compute deviations between real and nominal system states, predicting forthcoming real states as initial conditions for the imminent OCP solution. Anticipatory computation stores optimal control based on current nominal states, thus mitigating the delay effects. Additionally, we establish an upper bound for linearization error, effectively linearizing the nonlinear system, reducing OCP complexity, and enhancing response speed. 
We provide empirical validation through two numerical simulations and corresponding real-world robot tasks, demonstrating significant performance improvements and augmented response speed (up to $90\%$) resulting from the seamless integration of our proposed approach compared to conventional time-triggered MPC strategies. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.01265v3-abstract-full').style.display = 'none'; document.getElementById('2403.01265v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 2 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">arXiv admin note: text overlap with arXiv:2103.09693</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.01093">arXiv:2403.01093</a> <span> [<a href="https://arxiv.org/pdf/2403.01093">pdf</a>, <a href="https://arxiv.org/format/2403.01093">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Variational Bayesian Learning Based Localization and Channel Reconstruction in RIS-aided Systems </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Li%2C+Y">Yunfei Li</a>, <a href="/search/eess?searchtype=author&query=Luo%2C+Y">Yiting Luo</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+X">Xianda Wu</a>, <a href="/search/eess?searchtype=author&query=Shi%2C+Z">Zheng Shi</a>, <a href="/search/eess?searchtype=author&query=Ma%2C+S">Shaodan Ma</a>, <a href="/search/eess?searchtype=author&query=Yang%2C+G">Guanghua Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.01093v1-abstract-short" style="display: inline;"> The emerging immersive and autonomous services have posed stringent requirements on both communications and localization. By considering the great potential of reconfigurable intelligent surface (RIS), this paper focuses on the joint channel estimation and localization for RIS-aided wireless systems. As opposed to existing works that treat channel estimation and localization independently, this pa… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.01093v1-abstract-full').style.display = 'inline'; document.getElementById('2403.01093v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.01093v1-abstract-full" style="display: none;"> The emerging immersive and autonomous services have posed stringent requirements on both communications and localization. By considering the great potential of reconfigurable intelligent surface (RIS), this paper focuses on the joint channel estimation and localization for RIS-aided wireless systems. 
As opposed to existing works that treat channel estimation and localization independently, this paper exploits the intrinsic coupling and nonlinear relationships between the channel parameters and user location for enhancement of both localization and channel reconstruction. By noticing the non-convex, nonlinear objective function and the sparser angle pattern, a variational Bayesian learning-based framework is developed to jointly estimate the channel parameters and user location through leveraging an effective approximation of the posterior distribution. The proposed framework is capable of unifying near-field and far-field scenarios owing to exploitation of sparsity of the angular domain. Since the joint channel and location estimation problem has a closed-form solution in each iteration, our proposed iterative algorithm performs better than the conventional particle swarm optimization (PSO) and maximum likelihood (ML) based ones in terms of computational complexity. Simulations demonstrate that the proposed algorithm almost reaches the Bayesian Cramer-Rao bound (BCRB) and achieves a superior estimation accuracy by comparing to the PSO and the ML algorithms. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.01093v1-abstract-full').style.display = 'none'; document.getElementById('2403.01093v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2402.15897">arXiv:2402.15897</a> <span> [<a href="https://arxiv.org/pdf/2402.15897">pdf</a>, <a href="https://arxiv.org/format/2402.15897">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> MMW-Carry: Enhancing Carry Object Detection through Millimeter-Wave Radar-Camera Fusion </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Gao%2C+X">Xiangyu Gao</a>, <a href="/search/eess?searchtype=author&query=Luo%2C+Y">Youchen Luo</a>, <a href="/search/eess?searchtype=author&query=Alansari%2C+A">Ali Alansari</a>, <a href="/search/eess?searchtype=author&query=Sun%2C+Y">Yaping Sun</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2402.15897v1-abstract-short" style="display: inline;"> This paper introduces MMW-Carry, a system designed to predict the probability of individuals carrying various objects using millimeter-wave radar signals, complemented by camera input. The primary goal of MMW-Carry is to provide a rapid and cost-effective preliminary screening solution, specifically tailored for non-super-sensitive scenarios. 
Overall, MMW-Carry achieves significant advancements in… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.15897v1-abstract-full').style.display = 'inline'; document.getElementById('2402.15897v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2402.15897v1-abstract-full" style="display: none;"> This paper introduces MMW-Carry, a system designed to predict the probability of individuals carrying various objects using millimeter-wave radar signals, complemented by camera input. The primary goal of MMW-Carry is to provide a rapid and cost-effective preliminary screening solution, specifically tailored for non-super-sensitive scenarios. Overall, MMW-Carry achieves significant advancements in two crucial aspects. Firstly, it addresses localization challenges in complex indoor environments caused by multi-path reflections, enhancing the system's overall robustness. This is accomplished by the integration of camera-based human detection, tracking, and the radar-camera plane transformation for obtaining subjects' spatial occupancy region, followed by a zooming-in operation on the radar images. Secondly, the system performance is elevated by leveraging long-term observation of a subject. This is realized through the intelligent fusion of neural network results from multiple different-view radar images of an in-track moving subject and their carried objects, facilitated by a proposed knowledge-transfer module. Our experiment results demonstrate that MMW-Carry detects objects with an average error rate of 25.22\% false positives and a 21.71\% missing rate for individuals moving randomly in a large indoor space, carrying the common-in-everyday-life objects, both in open carry or concealed ways. These findings affirm MMW-Carry's potential to extend its capabilities to detect a broader range of objects for diverse applications. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.15897v1-abstract-full').style.display = 'none'; document.getElementById('2402.15897v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2401.00708">arXiv:2401.00708</a> <span> [<a href="https://arxiv.org/pdf/2401.00708">pdf</a>, <a href="https://arxiv.org/format/2401.00708">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Revisiting Nonlocal Self-Similarity from Continuous Representation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Luo%2C+Y">Yisi Luo</a>, <a href="/search/eess?searchtype=author&query=Zhao%2C+X">Xile Zhao</a>, <a href="/search/eess?searchtype=author&query=Meng%2C+D">Deyu Meng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2401.00708v1-abstract-short" style="display: inline;"> Nonlocal self-similarity (NSS) is an important prior that has been successfully applied in multi-dimensional data processing tasks, e.g., image and video recovery. However, existing NSS-based methods are solely suitable for meshgrid data such as images and videos, but are not suitable for emerging off-meshgrid data, e.g., point cloud and climate data. In this work, we revisit the NSS from the cont… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.00708v1-abstract-full').style.display = 'inline'; document.getElementById('2401.00708v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2401.00708v1-abstract-full" style="display: none;"> Nonlocal self-similarity (NSS) is an important prior that has been successfully applied in multi-dimensional data processing tasks, e.g., image and video recovery. However, existing NSS-based methods are solely suitable for meshgrid data such as images and videos, but are not suitable for emerging off-meshgrid data, e.g., point cloud and climate data. In this work, we revisit the NSS from the continuous representation perspective and propose a novel Continuous Representation-based NonLocal method (termed as CRNL), which has two innovative features as compared with classical nonlocal methods. First, based on the continuous representation, our CRNL unifies the measure of self-similarity for on-meshgrid and off-meshgrid data and thus is naturally suitable for both of them. Second, the nonlocal continuous groups can be more compactly and efficiently represented by the coupled low-rank function factorization, which simultaneously exploits the similarity within each group and across different groups, while classical nonlocal methods neglect the similarity across groups. This elaborately designed coupled mechanism allows our method to enjoy favorable performance over conventional NSS methods in terms of both effectiveness and efficiency. 
Extensive multi-dimensional data processing experiments on-meshgrid (e.g., image inpainting and image denoising) and off-meshgrid (e.g., climate data prediction and point cloud recovery) validate the versatility, effectiveness, and efficiency of our CRNL as compared with state-of-the-art methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.00708v1-abstract-full').style.display = 'none'; document.getElementById('2401.00708v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 January, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2312.15659">arXiv:2312.15659</a> <span> [<a href="https://arxiv.org/pdf/2312.15659">pdf</a>, <a href="https://arxiv.org/format/2312.15659">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Perceptual Quality Assessment for Video Frame Interpolation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Han%2C+J">Jinliang Han</a>, <a href="/search/eess?searchtype=author&query=Min%2C+X">Xiongkuo Min</a>, <a href="/search/eess?searchtype=author&query=Gao%2C+Y">Yixuan Gao</a>, <a href="/search/eess?searchtype=author&query=Jia%2C+J">Jun Jia</a>, <a href="/search/eess?searchtype=author&query=Sun%2C+L">Lei Sun</a>, <a href="/search/eess?searchtype=author&query=Cao%2C+Z">Zuowei Cao</a>, <a href="/search/eess?searchtype=author&query=Luo%2C+Y">Yonglin Luo</a>, <a href="/search/eess?searchtype=author&query=Zhai%2C+G">Guangtao Zhai</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2312.15659v1-abstract-short" style="display: inline;"> The quality of frames is significant for both research and application of video frame interpolation (VFI). In recent VFI studies, the methods of full-reference image quality assessment have generally been used to evaluate the quality of VFI frames. However, high frame rate reference videos, necessities for the full-reference methods, are difficult to obtain in most applications of VFI. To evaluate… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.15659v1-abstract-full').style.display = 'inline'; document.getElementById('2312.15659v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2312.15659v1-abstract-full" style="display: none;"> The quality of frames is significant for both research and application of video frame interpolation (VFI). In recent VFI studies, the methods of full-reference image quality assessment have generally been used to evaluate the quality of VFI frames. However, high frame rate reference videos, necessities for the full-reference methods, are difficult to obtain in most applications of VFI. To evaluate the quality of VFI frames without reference videos, a no-reference perceptual quality assessment method is proposed in this paper. 
This method is more compatible with VFI application and the evaluation scores from it are consistent with human subjective opinions. A new quality assessment dataset for VFI was constructed through subjective experiments firstly, to assess the opinion scores of interpolated frames. The dataset was created from triplets of frames extracted from high-quality videos using 9 state-of-the-art VFI algorithms. The proposed method evaluates the perceptual coherence of frames incorporating the original pair of VFI inputs. Specifically, the method applies a triplet network architecture, including three parallel feature pipelines, to extract the deep perceptual features of the interpolated frame as well as the original pair of frames. Coherence similarities of the two-way parallel features are jointly calculated and optimized as a perceptual metric. In the experiments, both full-reference and no-reference quality assessment methods were tested on the new quality dataset. The results show that the proposed method achieves the best performance among all compared quality assessment methods on the dataset. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.15659v1-abstract-full').style.display = 'none'; document.getElementById('2312.15659v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages, 4 figures</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.4.0 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2312.13744">arXiv:2312.13744</a> <span> [<a href="https://arxiv.org/pdf/2312.13744">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Modelling of Networked Measuring Systems -- From White-Box Models to Data Based Approaches </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Sommer%2C+K">Klaus-Dieter Sommer</a>, <a href="/search/eess?searchtype=author&query=Harris%2C+P">Peter Harris</a>, <a href="/search/eess?searchtype=author&query=Eichst%C3%A4dt%2C+S">Sascha Eichstädt</a>, <a href="/search/eess?searchtype=author&query=F%C3%BCssl%2C+R">Roland Füssl</a>, <a href="/search/eess?searchtype=author&query=Dorst%2C+T">Tanja Dorst</a>, <a href="/search/eess?searchtype=author&query=Sch%C3%BCtze%2C+A">Andreas Schütze</a>, <a href="/search/eess?searchtype=author&query=Heizmann%2C+M">Michael Heizmann</a>, <a href="/search/eess?searchtype=author&query=Schiering%2C+N">Nadine Schiering</a>, <a href="/search/eess?searchtype=author&query=Maier%2C+A">Andreas Maier</a>, <a href="/search/eess?searchtype=author&query=Luo%2C+Y">Yuhui Luo</a>, <a href="/search/eess?searchtype=author&query=Tachtatzis%2C+C">Christos Tachtatzis</a>, <a href="/search/eess?searchtype=author&query=Andonovic%2C+I">Ivan Andonovic</a>, <a href="/search/eess?searchtype=author&query=Gourlay%2C+G">Gordon Gourlay</a> </p> <p class="abstract mathjax"> <span 
class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2312.13744v1-abstract-short" style="display: inline;"> Mathematical modelling is at the core of metrology as it transforms raw measured data into useful measurement results. A model captures the relationship between the measurand and all relevant quantities on which the measurand depends, and is used to design measuring systems, analyse measured data, make inferences and predictions, and is the basis for evaluating measurement uncertainties. Tradition… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.13744v1-abstract-full').style.display = 'inline'; document.getElementById('2312.13744v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2312.13744v1-abstract-full" style="display: none;"> Mathematical modelling is at the core of metrology as it transforms raw measured data into useful measurement results. A model captures the relationship between the measurand and all relevant quantities on which the measurand depends, and is used to design measuring systems, analyse measured data, make inferences and predictions, and is the basis for evaluating measurement uncertainties. Traditional modelling approaches are typically analytical, for example, based on principles of physics. But with the increasing use of digital technologies, large sensor networks and powerful computing hardware, these traditional approaches are being replaced more and more by data-driven methods. This paradigm shift holds true in particular for the digital future of measurement in all spheres of our lives and the environment, where data provided by large and complex interconnected systems of sensors are to be analysed. Additionally, there is a requirement for existing guidelines and standards in metrology to take the paradigm shift into account. In this paper we lay the foundation for the development from traditional to data-driven modelling approaches. We identify key aspects from traditional modelling approaches and discuss their transformation to data-driven modelling. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.13744v1-abstract-full').style.display = 'none'; document.getElementById('2312.13744v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2023. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2312.10381">arXiv:2312.10381</a> <span> [<a href="https://arxiv.org/pdf/2312.10381">pdf</a>, <a href="https://arxiv.org/format/2312.10381">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> SECap: Speech Emotion Captioning with Large Language Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Xu%2C+Y">Yaoxun Xu</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+H">Hangting Chen</a>, <a href="/search/eess?searchtype=author&query=Yu%2C+J">Jianwei Yu</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+Q">Qiaochu Huang</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+Z">Zhiyong Wu</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+S">Shixiong Zhang</a>, <a href="/search/eess?searchtype=author&query=Li%2C+G">Guangzhi Li</a>, <a href="/search/eess?searchtype=author&query=Luo%2C+Y">Yi Luo</a>, <a href="/search/eess?searchtype=author&query=Gu%2C+R">Rongzhi Gu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2312.10381v3-abstract-short" style="display: inline;"> Speech emotions are crucial in human communication and are extensively used in fields like speech synthesis and natural language understanding. Most prior studies, such as speech emotion recognition, have categorized speech emotions into a fixed set of classes. Yet, emotions expressed in human speech are often complex, and categorizing them into predefined groups can be insufficient to adequately… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.10381v3-abstract-full').style.display = 'inline'; document.getElementById('2312.10381v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2312.10381v3-abstract-full" style="display: none;"> Speech emotions are crucial in human communication and are extensively used in fields like speech synthesis and natural language understanding. Most prior studies, such as speech emotion recognition, have categorized speech emotions into a fixed set of classes. Yet, emotions expressed in human speech are often complex, and categorizing them into predefined groups can be insufficient to adequately represent speech emotions. On the contrary, describing speech emotions directly by means of natural language may be a more effective approach. Regrettably, there are not many studies available that have focused on this direction. Therefore, this paper proposes a speech emotion captioning framework named SECap, aiming at effectively describing speech emotions using natural language. Owing to the impressive capabilities of large language models in language comprehension and text generation, SECap employs LLaMA as the text decoder to allow the production of coherent speech emotion captions. In addition, SECap leverages HuBERT as the audio encoder to extract general speech features and Q-Former as the Bridge-Net to provide LLaMA with emotion-related speech features. 
To accomplish this, Q-Former utilizes mutual information learning to disentangle emotion-related speech features and speech contents, while implementing contrastive learning to extract more emotion-related speech features. The results of objective and subjective evaluations demonstrate that: 1) the SECap framework outperforms the HTSAT-BART baseline in all objective evaluations; 2) SECap can generate high-quality speech emotion captions that attain performance on par with human annotators in subjective mean opinion score tests. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.10381v3-abstract-full').style.display = 'none'; document.getElementById('2312.10381v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 16 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by AAAI 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2312.05279">arXiv:2312.05279</a> <span> [<a href="https://arxiv.org/pdf/2312.05279">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Quantitative perfusion maps using a novelty spatiotemporal convolutional neural network </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Cao%2C+A">Anbo Cao</a>, <a href="/search/eess?searchtype=author&query=Le%2C+P">Pin-Yu Le</a>, <a href="/search/eess?searchtype=author&query=Qie%2C+Z">Zhonghui Qie</a>, <a href="/search/eess?searchtype=author&query=Hassan%2C+H">Haseeb Hassan</a>, <a href="/search/eess?searchtype=author&query=Guo%2C+Y">Yingwei Guo</a>, <a href="/search/eess?searchtype=author&query=Zaman%2C+A">Asim Zaman</a>, <a href="/search/eess?searchtype=author&query=Lu%2C+J">Jiaxi Lu</a>, <a href="/search/eess?searchtype=author&query=Zeng%2C+X">Xueqiang Zeng</a>, <a href="/search/eess?searchtype=author&query=Yang%2C+H">Huihui Yang</a>, <a href="/search/eess?searchtype=author&query=Miao%2C+X">Xiaoqiang Miao</a>, <a href="/search/eess?searchtype=author&query=Han%2C+T">Taiyu Han</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+G">Guangtao Huang</a>, <a href="/search/eess?searchtype=author&query=Kang%2C+Y">Yan Kang</a>, <a href="/search/eess?searchtype=author&query=Luo%2C+Y">Yu Luo</a>, <a href="/search/eess?searchtype=author&query=Guo%2C+J">Jia Guo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2312.05279v1-abstract-short" style="display: inline;"> Dynamic susceptibility contrast magnetic resonance imaging (DSC-MRI) is widely used to evaluate acute ischemic stroke to distinguish salvageable tissue and infarct core. 
For this purpose, traditional methods employ deconvolution techniques, like singular value decomposition, which are known to be vulnerable to noise, potentially distorting the derived perfusion parameters. However, deep learning t… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.05279v1-abstract-full').style.display = 'inline'; document.getElementById('2312.05279v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2312.05279v1-abstract-full" style="display: none;"> Dynamic susceptibility contrast magnetic resonance imaging (DSC-MRI) is widely used to evaluate acute ischemic stroke to distinguish salvageable tissue and infarct core. For this purpose, traditional methods employ deconvolution techniques, like singular value decomposition, which are known to be vulnerable to noise, potentially distorting the derived perfusion parameters. However, deep learning technology could leverage it, which can accurately estimate clinical perfusion parameters compared to traditional clinical approaches. Therefore, this study presents a perfusion parameters estimation network that considers spatial and temporal information, the Spatiotemporal Network (ST-Net), for the first time. The proposed network comprises a designed physical loss function to enhance model performance further. The results indicate that the network can accurately estimate perfusion parameters, including cerebral blood volume (CBV), cerebral blood flow (CBF), and time to maximum of the residual function (Tmax). The structural similarity index (SSIM) mean values for CBV, CBF, and Tmax parameters were 0.952, 0.943, and 0.863, respectively. The DICE score for the hypo-perfused region reached 0.859, demonstrating high consistency. The proposed model also maintains time efficiency, closely approaching the performance of commercial gold-standard software. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.05279v1-abstract-full').style.display = 'none'; document.getElementById('2312.05279v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2023. 
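<p class="is-size-7">For context on the deconvolution baseline mentioned in this abstract, the following is a minimal, illustrative truncated-SVD deconvolution sketch; the variable names, sampling setup, and the 20% singular-value threshold are assumptions of this example, not details taken from the paper.</p> <pre><code class="language-python">
# Illustrative truncated-SVD deconvolution for DSC-MRI perfusion (the
# "traditional" baseline the abstract contrasts with); not code from the paper.
import numpy as np

def svd_deconvolution(ctc, aif, dt, sv_threshold=0.2):
    """Estimate the CBF-scaled residue function from a tissue concentration curve.

    ctc : (T,) tissue concentration curve
    aif : (T,) arterial input function
    dt  : sampling interval in seconds
    """
    T = len(aif)
    # Lower-triangular convolution matrix built from the AIF: ctc = dt * A @ (CBF * R)
    A = np.zeros((T, T))
    for i in range(T):
        A[i, : i + 1] = aif[i::-1]
    A *= dt

    U, s, Vt = np.linalg.svd(A)
    # Truncate small singular values to suppress noise amplification
    s_inv = np.where(s > sv_threshold * s.max(), 1.0 / s, 0.0)
    cbf_r = Vt.T @ (s_inv * (U.T @ ctc))   # CBF * R(t)

    cbf = cbf_r.max()                                   # cerebral blood flow (scaled units)
    cbv = np.trapz(ctc, dx=dt) / np.trapz(aif, dx=dt)   # cerebral blood volume
    tmax = np.argmax(cbf_r) * dt                        # time to max of residue function
    return cbf, cbv, tmax
</code></pre> <p class="is-size-7">The hard threshold on singular values is what makes this baseline noise-sensitive, which is the weakness the learned spatiotemporal estimator described above is meant to address.</p>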
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2312.03464">arXiv:2312.03464</a> <span> [<a href="https://arxiv.org/pdf/2312.03464">pdf</a>, <a href="https://arxiv.org/format/2312.03464">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Subnetwork-to-go: Elastic Neural Network with Dynamic Training and Customizable Inference </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Li%2C+K">Kai Li</a>, <a href="/search/eess?searchtype=author&query=Luo%2C+Y">Yi Luo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2312.03464v1-abstract-short" style="display: inline;"> Deploying neural networks to different devices or platforms is in general challenging, especially when the model size is large or model complexity is high. Although there exist ways for model pruning or distillation, it is typically required to perform a full round of model training or finetuning procedure in order to obtain a smaller model that satisfies the model size or complexity constraints.… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.03464v1-abstract-full').style.display = 'inline'; document.getElementById('2312.03464v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2312.03464v1-abstract-full" style="display: none;"> Deploying neural networks to different devices or platforms is in general challenging, especially when the model size is large or model complexity is high. Although there exist ways for model pruning or distillation, it is typically required to perform a full round of model training or finetuning procedure in order to obtain a smaller model that satisfies the model size or complexity constraints. Motivated by recent works on dynamic neural networks, we propose a simple way to train a large network and flexibly extract a subnetwork from it given a model size or complexity constraint during inference. We introduce a new way to allow a large model to be trained with dynamic depth and width during the training phase, and after the large model is trained we can select a subnetwork from it with arbitrary depth and width during the inference phase with a relatively better performance compared to training the subnetwork independently from scratch. Experiment results on a music source separation model show that our proposed method can effectively improve the separation performance across different subnetwork sizes and complexities with a single large model, and training the large model takes significantly shorter time than training all the different subnetworks. 
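<p class="is-size-7">As a rough illustration of the idea described above (training one large model and slicing out a shallower or narrower subnetwork at inference), the following is a minimal PyTorch-style sketch; the module layout and the channel-slicing rule are assumptions of this example and not the authors' actual architecture.</p> <pre><code class="language-python">
# Minimal sketch: one set of trained parameters serves many depth/width choices.
import torch
import torch.nn as nn

class ElasticMLP(nn.Module):
    def __init__(self, dim=256, depth=8):
        super().__init__()
        self.layers = nn.ModuleList(nn.Linear(dim, dim) for _ in range(depth))

    def forward(self, x, depth=None, width=None):
        depth = depth or len(self.layers)
        width = width or self.layers[0].out_features
        for layer in self.layers[:depth]:
            # Slice the first `width` rows/columns of each weight matrix so the
            # same parameters are reused at every width without retraining.
            w = layer.weight[:width, :x.shape[-1]]
            b = layer.bias[:width]
            x = torch.relu(nn.functional.linear(x, w, b))
        return x

model = ElasticMLP()
x = torch.randn(4, 256)
full = model(x)                                  # full depth and width
small = model(x[:, :128], depth=4, width=128)    # customized subnetwork at inference
</code></pre>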
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.03464v1-abstract-full').style.display = 'none'; document.getElementById('2312.03464v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages, 3 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2311.01781">arXiv:2311.01781</a> <span> [<a href="https://arxiv.org/pdf/2311.01781">pdf</a>, <a href="https://arxiv.org/format/2311.01781">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Passive Handwriting Tracking via Weak mmWave Communication Signals </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Yu%2C+C">Chao Yu</a>, <a href="/search/eess?searchtype=author&query=Luo%2C+Y">Yan Luo</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+R">Renqi Chen</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+R">Rui Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2311.01781v1-abstract-short" style="display: inline;"> In this letter, a cooperative sensing framework based on millimeter wave (mmWave) communication systems is proposed to detect tiny motions with a millimeter-level resolution. Particularly, the cooperative sensing framework is facilitated with one transmitter and two receivers. There are two radio frequency (RF) chains at each receiver. Hence, the Doppler effect due to the tiny motions can be detec… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.01781v1-abstract-full').style.display = 'inline'; document.getElementById('2311.01781v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2311.01781v1-abstract-full" style="display: none;"> In this letter, a cooperative sensing framework based on millimeter wave (mmWave) communication systems is proposed to detect tiny motions with a millimeter-level resolution. Particularly, the cooperative sensing framework is facilitated with one transmitter and two receivers. There are two radio frequency (RF) chains at each receiver. Hence, the Doppler effect due to the tiny motions can be detected via passive sensing respectively at the receivers, and the velocities of the motions can be estimated by integrating the Doppler frequencies. It is demonstrated that the proposed cooperative sensing system is able to track the handwriting with 90% error below 6 mm. Moreover, the proposed cooperative sensing is robust to the strength of received signal. For example, it works even without the line-of-sight paths from the transmitter to the receivers or the sensing target, where the received signal strength is not sufficient for timing synchronization or demodulation. 
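<p class="is-size-7">The step "the velocities of the motions can be estimated by integrating the Doppler frequencies" can be illustrated with a toy calculation; the 60 GHz carrier, the monostatic Doppler relation, and the sample values below are assumptions for illustration only, and a bistatic link needs a geometry-dependent factor.</p> <pre><code class="language-python">
# Toy example: turn a Doppler-frequency track into radial velocity and displacement.
import numpy as np

f_c = 60e9                      # mmWave carrier frequency (Hz), assumed
c = 3e8                         # speed of light (m/s)
dt = 1e-3                       # sampling interval of the Doppler estimates (s)

f_d = np.array([0.0, 40.0, 80.0, 80.0, 40.0, 0.0])   # example Doppler track (Hz)
velocity = f_d * c / (2.0 * f_c)                      # monostatic relation v = f_d * c / (2 f_c)
displacement = np.cumsum(velocity) * dt               # integrate velocity over time (m)

print(velocity * 1e3)        # mm/s
print(displacement * 1e3)    # radial trajectory in mm, millimetre-scale as in the letter
</code></pre>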
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.01781v1-abstract-full').style.display = 'none'; document.getElementById('2311.01781v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2310.05813">arXiv:2310.05813</a> <span> [<a href="https://arxiv.org/pdf/2310.05813">pdf</a>, <a href="https://arxiv.org/format/2310.05813">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Audio compression-assisted feature extraction for voice replay attack detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Shi%2C+X">Xiangyu Shi</a>, <a href="/search/eess?searchtype=author&query=Luo%2C+Y">Yuhao Luo</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+L">Li Wang</a>, <a href="/search/eess?searchtype=author&query=He%2C+H">Haorui He</a>, <a href="/search/eess?searchtype=author&query=Li%2C+H">Hao Li</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+L">Lei Wang</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+Z">Zhizheng Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2310.05813v2-abstract-short" style="display: inline;"> Replay attack is one of the most effective and simplest voice spoofing attacks. Detecting replay attacks is challenging, according to the Automatic Speaker Verification Spoofing and Countermeasures Challenge 2021 (ASVspoof 2021), because they involve a loudspeaker, a microphone, and acoustic conditions (e.g., background noise). One obstacle to detecting replay attacks is finding robust feature rep… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.05813v2-abstract-full').style.display = 'inline'; document.getElementById('2310.05813v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2310.05813v2-abstract-full" style="display: none;"> Replay attack is one of the most effective and simplest voice spoofing attacks. Detecting replay attacks is challenging, according to the Automatic Speaker Verification Spoofing and Countermeasures Challenge 2021 (ASVspoof 2021), because they involve a loudspeaker, a microphone, and acoustic conditions (e.g., background noise). One obstacle to detecting replay attacks is finding robust feature representations that reflect the channel noise information added to the replayed speech. This study proposes a feature extraction approach that uses audio compression for assistance. Audio compression compresses audio to preserve content and speaker information for transmission. The missed information after decompression is expected to contain content- and speaker-independent information (e.g., channel noise added during the replay process). 
We conducted a comprehensive experiment with a few data augmentation techniques and 3 classifiers on the ASVspoof 2021 physical access (PA) set and confirmed the effectiveness of the proposed feature extraction approach. To the best of our knowledge, the proposed approach achieves the lowest EER at 22.71% on the ASVspoof 2021 PA evaluation set. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.05813v2-abstract-full').style.display = 'none'; document.getElementById('2310.05813v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 9 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2310.05369">arXiv:2310.05369</a> <span> [<a href="https://arxiv.org/pdf/2310.05369">pdf</a>, <a href="https://arxiv.org/format/2310.05369">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> AdvSV: An Over-the-Air Adversarial Attack Dataset for Speaker Verification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Wang%2C+L">Li Wang</a>, <a href="/search/eess?searchtype=author&query=Li%2C+J">Jiaqi Li</a>, <a href="/search/eess?searchtype=author&query=Luo%2C+Y">Yuhao Luo</a>, <a href="/search/eess?searchtype=author&query=Zheng%2C+J">Jiahao Zheng</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+L">Lei Wang</a>, <a href="/search/eess?searchtype=author&query=Li%2C+H">Hao Li</a>, <a href="/search/eess?searchtype=author&query=Xu%2C+K">Ke Xu</a>, <a href="/search/eess?searchtype=author&query=Fang%2C+C">Chengfang Fang</a>, <a href="/search/eess?searchtype=author&query=Shi%2C+J">Jie Shi</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+Z">Zhizheng Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2310.05369v2-abstract-short" style="display: inline;"> It is known that deep neural networks are vulnerable to adversarial attacks. Although Automatic Speaker Verification (ASV) built on top of deep neural networks exhibits robust performance in controlled scenarios, many studies confirm that ASV is vulnerable to adversarial attacks. The lack of a standard dataset is a bottleneck for further research, especially reproducible research. In this study, w… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.05369v2-abstract-full').style.display = 'inline'; document.getElementById('2310.05369v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2310.05369v2-abstract-full" style="display: none;"> It is known that deep neural networks are vulnerable to adversarial attacks. 
Although Automatic Speaker Verification (ASV) built on top of deep neural networks exhibits robust performance in controlled scenarios, many studies confirm that ASV is vulnerable to adversarial attacks. The lack of a standard dataset is a bottleneck for further research, especially reproducible research. In this study, we developed an open-source adversarial attack dataset for speaker verification research. As an initial step, we focused on the over-the-air attack. An over-the-air adversarial attack involves a perturbation generation algorithm, a loudspeaker, a microphone, and an acoustic environment. The variations in the recording configurations make it very challenging to reproduce previous research. The AdvSV dataset is constructed using the Voxceleb1 Verification test set as its foundation. This dataset employs representative ASV models subjected to adversarial attacks and records adversarial samples to simulate over-the-air attack settings. The scope of the dataset can be easily extended to include more types of adversarial attacks. The dataset will be released to the public under the CC BY-SA 4.0. In addition, we also provide a detection baseline for reproducible research. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.05369v2-abstract-full').style.display = 'none'; document.getElementById('2310.05369v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 January, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 8 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ICASSP2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2309.13905">arXiv:2309.13905</a> <span> [<a href="https://arxiv.org/pdf/2309.13905">pdf</a>, <a href="https://arxiv.org/format/2309.13905">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> AutoPrep: An Automatic Preprocessing Framework for In-the-Wild Speech Data </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Yu%2C+J">Jianwei Yu</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+H">Hangting Chen</a>, <a href="/search/eess?searchtype=author&query=Bian%2C+Y">Yanyao Bian</a>, <a href="/search/eess?searchtype=author&query=Li%2C+X">Xiang Li</a>, <a href="/search/eess?searchtype=author&query=Luo%2C+Y">Yi Luo</a>, <a href="/search/eess?searchtype=author&query=Tian%2C+J">Jinchuan Tian</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+M">Mengyang Liu</a>, <a href="/search/eess?searchtype=author&query=Jiang%2C+J">Jiayi Jiang</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+S">Shuai Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2309.13905v1-abstract-short" style="display: inline;"> Recently, the 
utilization of extensive open-sourced text data has significantly advanced the performance of text-based large language models (LLMs). However, the use of in-the-wild large-scale speech data in the speech technology community remains constrained. One reason for this limitation is that a considerable amount of the publicly available speech data is compromised by background noise, spee… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.13905v1-abstract-full').style.display = 'inline'; document.getElementById('2309.13905v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2309.13905v1-abstract-full" style="display: none;"> Recently, the utilization of extensive open-sourced text data has significantly advanced the performance of text-based large language models (LLMs). However, the use of in-the-wild large-scale speech data in the speech technology community remains constrained. One reason for this limitation is that a considerable amount of the publicly available speech data is compromised by background noise, speech overlapping, lack of speech segmentation information, missing speaker labels, and incomplete transcriptions, which can largely hinder their usefulness. On the other hand, human annotation of speech data is both time-consuming and costly. To address this issue, we introduce an automatic in-the-wild speech data preprocessing framework (AutoPrep) in this paper, which is designed to enhance speech quality, generate speaker labels, and produce transcriptions automatically. The proposed AutoPrep framework comprises six components: speech enhancement, speech segmentation, speaker clustering, target speech extraction, quality filtering and automatic speech recognition. Experiments conducted on the open-sourced WenetSpeech and our self-collected AutoPrepWild corpora demonstrate that the proposed AutoPrep framework can generate preprocessed data with similar DNSMOS and PDNSMOS scores compared to several open-sourced TTS datasets. The corresponding TTS system can achieve up to 0.68 in-domain speaker similarity. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.13905v1-abstract-full').style.display = 'none'; document.getElementById('2309.13905v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 September, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2309.06598">arXiv:2309.06598</a> <span> [<a href="https://arxiv.org/pdf/2309.06598">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Efficient Post-processing of Diffusion Tensor Cardiac Magnetic Imaging Using Texture-conserving Deformable Registration </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Wang%2C+F">Fanwen Wang</a>, <a href="/search/eess?searchtype=author&query=Ferreira%2C+P+F">Pedro F. 
Ferreira</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+Y">Yinzhe Wu</a>, <a href="/search/eess?searchtype=author&query=Munoz%2C+C">Camila Munoz</a>, <a href="/search/eess?searchtype=author&query=Wen%2C+K">Ke Wen</a>, <a href="/search/eess?searchtype=author&query=Luo%2C+Y">Yaqing Luo</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+J">Jiahao Huang</a>, <a href="/search/eess?searchtype=author&query=Pennell%2C+D+J">Dudley J. Pennell</a>, <a href="/search/eess?searchtype=author&query=Scott%2C+A+D">Andrew D. Scott</a>, <a href="/search/eess?searchtype=author&query=Nielles-Vallespin%2C+S">Sonia Nielles-Vallespin</a>, <a href="/search/eess?searchtype=author&query=Yang%2C+G">Guang Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2309.06598v3-abstract-short" style="display: inline;"> Diffusion tensor cardiac magnetic resonance (DT-CMR) is a method capable of providing non-invasive measurements of myocardial microstructure. Image registration is essential to correct image shifts due to intra and inter breath-hold motion and imperfect cardiac triggering. Registration is challenging in DT-CMR due to the low signal-to-noise and various contrasts induced by the diffusion encoding i… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.06598v3-abstract-full').style.display = 'inline'; document.getElementById('2309.06598v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2309.06598v3-abstract-full" style="display: none;"> Diffusion tensor cardiac magnetic resonance (DT-CMR) is a method capable of providing non-invasive measurements of myocardial microstructure. Image registration is essential to correct image shifts due to intra and inter breath-hold motion and imperfect cardiac triggering. Registration is challenging in DT-CMR due to the low signal-to-noise and various contrasts induced by the diffusion encoding in the myocardium and surrounding organs. Traditional deformable registration corrects through-plane motion but at the risk of destroying the texture information while rigid registration inefficiently discards frames with local deformation. In this study, we explored the possibility of deep learning-based deformable registration on DT-CMR. Based on the noise suppression using low-rank features and diffusion encoding suppression using variational auto encoder-decoder, a B-spline based registration network extracted the displacement fields and maintained the texture features of DT-CMR. In this way, our method improved the efficiency of frame utilization, manual cropping, and computational speed. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.06598v3-abstract-full').style.display = 'none'; document.getElementById('2309.06598v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 12 September, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2023. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">7 pages, 4 figures, conference</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2308.16892">arXiv:2308.16892</a> <span> [<a href="https://arxiv.org/pdf/2308.16892">pdf</a>, <a href="https://arxiv.org/format/2308.16892">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> ReZero: Region-customizable Sound Extraction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Gu%2C+R">Rongzhi Gu</a>, <a href="/search/eess?searchtype=author&query=Luo%2C+Y">Yi Luo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2308.16892v1-abstract-short" style="display: inline;"> We introduce region-customizable sound extraction (ReZero), a general and flexible framework for the multi-channel region-wise sound extraction (R-SE) task. R-SE task aims at extracting all active target sounds (e.g., human speech) within a specific, user-defined spatial region, which is different from conventional and existing tasks where a blind separation or a fixed, predefined spatial region a… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.16892v1-abstract-full').style.display = 'inline'; document.getElementById('2308.16892v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2308.16892v1-abstract-full" style="display: none;"> We introduce region-customizable sound extraction (ReZero), a general and flexible framework for the multi-channel region-wise sound extraction (R-SE) task. R-SE task aims at extracting all active target sounds (e.g., human speech) within a specific, user-defined spatial region, which is different from conventional and existing tasks where a blind separation or a fixed, predefined spatial region are typically assumed. The spatial region can be defined as an angular window, a sphere, a cone, or other geometric patterns. Being a solution to the R-SE task, the proposed ReZero framework includes (1) definitions of different types of spatial regions, (2) methods for region feature extraction and aggregation, and (3) a multi-channel extension of the band-split RNN (BSRNN) model specified for the R-SE task. We design experiments for different microphone array geometries, different types of spatial regions, and comprehensive ablation studies on different system configurations. Experimental results on both simulated and real-recorded data demonstrate the effectiveness of ReZero. Demos are available at https://innerselfm.github.io/rezero/. 
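<p class="is-size-7">To make the notion of a user-defined spatial region concrete, the following sketch expresses the region types named above (angular window, sphere, cone) as simple membership tests over source positions; coordinate conventions and function names are assumptions of this example, not the ReZero interface.</p> <pre><code class="language-python">
# Sketch of region membership tests for the region types named in the abstract;
# illustrative only, not taken from the ReZero implementation.
import numpy as np

def in_angular_window(src_xyz, center_deg, width_deg):
    """Azimuth window of `width_deg` degrees centred on `center_deg`."""
    az = np.degrees(np.arctan2(src_xyz[1], src_xyz[0]))
    diff = (az - center_deg + 180.0) % 360.0 - 180.0
    return abs(diff) <= width_deg / 2.0

def in_sphere(src_xyz, center_xyz, radius):
    return np.linalg.norm(np.asarray(src_xyz) - np.asarray(center_xyz)) <= radius

def in_cone(src_xyz, axis_xyz, half_angle_deg):
    """Cone with apex at the array origin opening around `axis_xyz`."""
    u = np.asarray(src_xyz) / np.linalg.norm(src_xyz)
    v = np.asarray(axis_xyz) / np.linalg.norm(axis_xyz)
    angle = np.degrees(np.arccos(np.clip(np.dot(u, v), -1.0, 1.0)))
    return angle <= half_angle_deg

# A source 1 m away at 30 degrees azimuth, tested against a 60-degree window:
print(in_angular_window([np.cos(np.radians(30)), np.sin(np.radians(30)), 0.0], 0.0, 60.0))
</code></pre>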
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.16892v1-abstract-full').style.display = 'none'; document.getElementById('2308.16892v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 August, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">13 pages, 11 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2308.11053">arXiv:2308.11053</a> <span> [<a href="https://arxiv.org/pdf/2308.11053">pdf</a>, <a href="https://arxiv.org/format/2308.11053">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.21437/Interspeech.2023-2302">10.21437/Interspeech.2023-2302 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Ultra Dual-Path Compression For Joint Echo Cancellation And Noise Suppression </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Chen%2C+H">Hangting Chen</a>, <a href="/search/eess?searchtype=author&query=Yu%2C+J">Jianwei Yu</a>, <a href="/search/eess?searchtype=author&query=Luo%2C+Y">Yi Luo</a>, <a href="/search/eess?searchtype=author&query=Gu%2C+R">Rongzhi Gu</a>, <a href="/search/eess?searchtype=author&query=Li%2C+W">Weihua Li</a>, <a href="/search/eess?searchtype=author&query=Lu%2C+Z">Zhuocheng Lu</a>, <a href="/search/eess?searchtype=author&query=Weng%2C+C">Chao Weng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2308.11053v3-abstract-short" style="display: inline;"> Echo cancellation and noise reduction are essential for full-duplex communication, yet most existing neural networks have high computational costs and are inflexible in tuning model complexity. In this paper, we introduce time-frequency dual-path compression to achieve a wide range of compression ratios on computational cost. Specifically, for frequency compression, trainable filters are used to r… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.11053v3-abstract-full').style.display = 'inline'; document.getElementById('2308.11053v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2308.11053v3-abstract-full" style="display: none;"> Echo cancellation and noise reduction are essential for full-duplex communication, yet most existing neural networks have high computational costs and are inflexible in tuning model complexity. 
In this paper, we introduce time-frequency dual-path compression to achieve a wide range of compression ratios on computational cost. Specifically, for frequency compression, trainable filters are used to replace manually designed filters for dimension reduction. For time compression, only using frame skipped prediction causes large performance degradation, which can be alleviated by a post-processing network with full sequence modeling. We have found that under fixed compression ratios, dual-path compression combining both the time and frequency methods will give further performance improvement, covering compression ratios from 4x to 32x with little model size change. Moreover, the proposed models show competitive performance compared with fast FullSubNet and DeepFilterNet. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.11053v3-abstract-full').style.display = 'none'; document.getElementById('2308.11053v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 21 August, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Proceedings of INTERSPEECH</span> </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Luo%2C+Y&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Luo%2C+Y&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Luo%2C+Y&start=50" class="pagination-link " aria-label="Page 2">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Luo%2C+Y&start=100" class="pagination-link " aria-label="Page 3">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Luo%2C+Y&start=150" class="pagination-link " aria-label="Page 4">4 </a> </li> </ul> </nav> </div> </main>
</body> </html>