Search | arXiv e-print repository

Showing 1–50 of 231 results for author: Xu, H
Searching in archive eess. Sorted by announcement date (newest first); 50 results per page.

1. arXiv:2411.02867 [pdf] (eess.IV, cs.AI, cs.CV)
Title: AtlasSeg: Atlas Prior Guided Dual-U-Net for Cortical Segmentation in Fetal Brain MRI
Authors: Haoan Xu, Tianshu Zheng, Xinyi Xu, Yao Shen, Jiwei Sun, Cong Sun, Guangbin Wang, Dan Wu
Abstract: Accurate tissue segmentation in fetal brain MRI remains challenging due to the dynamically changing anatomy and contrast during fetal development. To enhance segmentation accuracy throughout gestation, we introduced AtlasSeg, a dual-U-shape convolution network incorporating gestational age (GA) specific information as guidance. By providing a publicly available fetal brain atlas with segmentation labels at the corresponding GA, AtlasSeg effectively extracted the contextual features of age-specific patterns in the atlas branch and generated tissue segmentations in the segmentation branch. Multi-scale attentive atlas feature fusions were constructed at all stages of encoding and decoding, giving rise to a dual-U-shape network that assists feature flow and information interaction between the two branches. AtlasSeg outperformed six well-known segmentation networks on both our internal fetal brain MRI dataset and the external FeTA dataset. Ablation experiments demonstrate the efficiency of atlas guidance and the attention mechanism. AtlasSeg delivered superior segmentation performance with higher accuracy than other convolution networks, and may facilitate fetal brain MRI analysis in large-scale fetal brain studies.
Submitted 5 November, 2024; originally announced November 2024.
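
To make the fusion mechanism concrete: the abstract describes multi-scale attentive fusion of atlas-branch features into the segmentation branch. Below is a minimal PyTorch sketch of one such attention-gated fusion stage; the module name, 3-D shapes, and channel counts are illustrative assumptions, not the authors' implementation.

```python
# Sketch only: attention-gated fusion between an atlas-encoder branch and a
# segmentation-encoder branch at one stage of a dual-U-shape network.
import torch
import torch.nn as nn

class AtlasAttentionFusion(nn.Module):
    """Fuse atlas features into segmentation features via a learned spatial gate."""
    def __init__(self, channels: int):
        super().__init__()
        self.gate = nn.Sequential(
            nn.Conv3d(2 * channels, channels, kernel_size=1),
            nn.Sigmoid(),  # per-voxel attention weights in [0, 1]
        )
        self.merge = nn.Conv3d(2 * channels, channels, kernel_size=3, padding=1)

    def forward(self, seg_feat, atlas_feat):
        attn = self.gate(torch.cat([seg_feat, atlas_feat], dim=1))
        fused = torch.cat([seg_feat, attn * atlas_feat], dim=1)
        return self.merge(fused)

# Usage: fuse 32-channel feature maps from both branches at one encoder stage.
fusion = AtlasAttentionFusion(32)
seg = torch.randn(1, 32, 16, 16, 16)    # segmentation-branch features
atlas = torch.randn(1, 32, 16, 16, 16)  # atlas-branch features (same stage)
out = fusion(seg, atlas)                # -> (1, 32, 16, 16, 16)
```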

2. arXiv:2411.00337 [pdf, other] (eess.SY, cs.LG); DOI: 10.1109/TIA.2023.3344544
Title: Coherent Hierarchical Probabilistic Forecasting of Electric Vehicle Charging Demand
Authors: Kedi Zheng, Hanwei Xu, Zeyang Long, Yi Wang, Qixin Chen
Abstract: The growing penetration of electric vehicles (EVs) significantly changes typical load curves in smart grids. With the development of fast charging technology, the volatility of EV charging demand is increasing, which requires additional flexibility for real-time power balance. Forecasting EV charging demand involves probabilistic modeling of high-dimensional time series dynamics across diverse electric vehicle charging stations (EVCSs). This paper studies the forecasting problem for multiple EVCSs in a hierarchical probabilistic manner. For each charging station, a deep learning model based on a partial input convex neural network (PICNN) is trained to predict the conditional distribution of day-ahead charging demand, preventing the quantile crossing problem common in traditional quantile regression models. Then, differentiable convex optimization layers (DCLs) are used to reconcile the scenarios sampled from these distributions into coherent scenarios that satisfy the hierarchical constraint. Compared with traditional optimization-based hierarchical reconciliation methods, this machine-learning approach learns a better weight matrix for adjusting the forecasting results of the different targets. Numerical experiments based on real-world EV charging data demonstrate the efficacy of the proposed method.
Submitted 3 November, 2024; v1 submitted 31 October, 2024; originally announced November 2024.
Comments: Paper accepted for IEEE Transactions on Industry Applications. Personal use of this material is permitted. Permission from IEEE must be obtained for all other uses.
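
The reconciliation step is the key idea here: sampled scenarios are adjusted so that station-level forecasts sum to the aggregate. The paper learns this mapping with differentiable convex optimization layers; the sketch below shows only the classical closed-form least-squares projection onto the aggregation constraint, as a baseline illustration (the toy two-station hierarchy is an assumption).

```python
# Closed-form OLS reconciliation of incoherent hierarchical forecasts.
import numpy as np

# Toy hierarchy: total demand = station 1 + station 2.
S = np.array([[1.0, 1.0],   # aggregate row
              [1.0, 0.0],   # station 1
              [0.0, 1.0]])  # station 2

def reconcile(y_hat: np.ndarray) -> np.ndarray:
    """Project incoherent forecasts y_hat = (total, s1, s2) onto coherent ones."""
    # OLS projection: b* = (S^T S)^{-1} S^T y_hat, then y = S b*.
    b = np.linalg.solve(S.T @ S, S.T @ y_hat)
    return S @ b

sample = np.array([10.0, 4.0, 5.0])  # total disagrees with 4 + 5
coherent = reconcile(sample)
print(coherent, coherent[0] - coherent[1:].sum())  # aggregate now equals the sum
```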

3. arXiv:2410.23646 [pdf, other] (eess.SY)
Title: LiFePO4 Battery SOC Estimation under OCV-SOC Curve Error Based on Adaptive Multi-Model Kalman Filter
Authors: Daniyaer Paizulamu, Lin Cheng, Yingrui Zhuang, Helin Xu, Ning Qi, Song Ci
Abstract: LiFePO4 batteries are widely used in electric vehicles and energy storage systems due to their long cycle life and high safety. However, the OCV-SOC curve (OSC) of these batteries features a long plateau region, making state of charge (SOC) estimation highly sensitive to OSC error, which arises from aging and temperature. To address this, we propose an SOC estimation method that accounts for error in the OSC. First, we establish a battery equivalent circuit model (ECM) and introduce a parameter identification algorithm based on adaptive recursive least squares. Next, we derive the relationship between the OSC error and the innovation's cross-correlation matrix (CCM) and auto-correlation matrix (ACM) in the Kalman filter. We then develop an adaptive multi-model Kalman filter (AMMKF), which dynamically adjusts the measurement model parameters of each filter based on the sign of the OSC error. By assigning a probability to each filter according to its predicted voltage distribution function, the optimal filter is selected. The proposed method is tested under various OSC error types and operating conditions. Results demonstrate high accuracy and robustness, with an RMSE below 3%, more than 10% lower than the estimation error of the traditional method.
Submitted 31 October, 2024; originally announced October 2024.
Comments: 10 pages, 9 figures
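
A minimal sketch of the multi-model idea, under strong simplifying assumptions: a bank of scalar Kalman filters, each assuming a different sign of OCV curve error, is reweighted by the Gaussian likelihood of its innovation. The dynamics, noise levels, and linearized OCV slope below are illustrative, not the paper's AMMKF.

```python
# Bank of scalar Kalman filters with Bayesian model-probability weighting.
import numpy as np

H, Q, R = 0.8, 1e-6, 1e-4        # linearized OCV slope, process/measurement noise
osc_biases = [-0.01, 0.0, 0.01]  # hypothesized OCV curve errors (volts)
x = np.full(3, 0.5)              # SOC estimate per filter
P = np.full(3, 1e-2)             # estimate variance per filter
w = np.full(3, 1.0 / 3)          # model probabilities

def step(z: float, u: float) -> float:
    """One predict/update cycle given measured voltage z and coulomb input u."""
    global w
    like = np.empty(3)
    for i, b in enumerate(osc_biases):
        x[i] += u                  # predict: coulomb counting
        P[i] += Q
        v = z - (H * x[i] + b)     # innovation under this bias hypothesis
        S = H * P[i] * H + R       # innovation variance
        K = P[i] * H / S           # Kalman gain
        x[i] += K * v
        P[i] *= (1 - K * H)
        like[i] = np.exp(-0.5 * v * v / S) / np.sqrt(2 * np.pi * S)
    w = w * like
    w /= w.sum()                   # reweight models by innovation likelihood
    return float(w @ x)            # fused SOC estimate

print(step(z=0.41, u=0.001))
```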

4. arXiv:2410.18089 [pdf, other] (cs.CY, cs.AI, cs.LG, eess.SY)
Title: Empowering Cognitive Digital Twins with Generative Foundation Models: Developing a Low-Carbon Integrated Freight Transportation System
Authors: Xueping Li, Haowen Xu, Jose Tupayachi, Olufemi Omitaomu, Xudong Wang
Abstract: Effective monitoring of freight transportation is essential for advancing sustainable, low-carbon economies. Traditional methods relying on single-modal data and discrete simulations fall short in optimizing intermodal systems holistically. These systems involve interconnected processes that affect shipping time, costs, emissions, and socio-economic factors. Developing digital twins for real-time awareness, predictive analytics, and urban logistics optimization requires extensive efforts in knowledge discovery, data integration, and multi-domain simulation. Recent advancements in generative AI offer new opportunities to streamline digital twin development by automating knowledge discovery and data integration, generating innovative simulation and optimization solutions. These models extend digital twins' capabilities by promoting autonomous workflows for data engineering, analytics, and software development. This paper proposes an innovative paradigm that leverages generative AI to enhance digital twins for urban research and operations. Using freight decarbonization as a case study, we propose a conceptual framework employing transformer-based language models to enhance an urban digital twin through foundation models. We share preliminary results and our vision for more intelligent, autonomous, and general-purpose digital twins for optimizing integrated freight systems from multimodal to synchromodal paradigms.
Submitted 8 October, 2024; originally announced October 2024.

5. arXiv:2410.06115 [pdf, other] (cs.IT, eess.SP)
Title: A physics-based perspective for understanding and utilizing spatial resources of wireless channels
Authors: Hui Xu, Jun Wei Wu, Zhen Jie Qi, Hao Tian Wu, Rui Wen Shao, Qiang Cheng, Jieao Zhu, Linglong Dai, Tie Jun Cui
Abstract: To satisfy the increasing demand for transmission rates in wireless communications, it is necessary to exploit the spatial resources of electromagnetic (EM) waves. In this context, EM information theory (EIT) has become a hot topic, integrating the theoretical frameworks of deterministic mathematics and stochastic statistics to explore the transmission mechanisms of continuous EM waves. However, previous studies focused primarily on framework analysis, with limited exploration of practical applications and of EIT's essential physical characteristics. In this paper, we present a three-dimensional (3-D) line-of-sight channel capacity formula that captures the vector EM physics and accommodates both near- and far-field scenarios. Based on this rigorous mathematical equation and the physical mechanism of fast multipole expansion, a channel model is established, and the finite angular spectral bandwidth feature of scattered waves is revealed. To adapt to this channel feature, an optimization problem is formulated for determining the mode currents on the transmitter, aiming at the optimal design of the precoder and combiner. We comprehensively analyze the relationship among the spatial degrees of freedom, noise, and transmitted power, thereby establishing a rigorous upper bound on channel capacity. A series of simulations is conducted to validate the theoretical model and numerical method. This work offers a novel perspective and methodology for understanding and leveraging EIT, and provides a theoretical foundation for the design and optimization of future wireless communications.
Submitted 8 October, 2024; originally announced October 2024.
Comments: 31 pages, 8 figures
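
For intuition about how spatial degrees of freedom turn into capacity, here is the textbook SVD-plus-water-filling recipe on a toy channel matrix. The paper's formulation instead derives mode currents and a capacity bound from vector EM theory, so this is only a conceptual stand-in.

```python
# Classical spatial-multiplexing capacity: SVD modes + water-filling power allocation.
import numpy as np

rng = np.random.default_rng(0)
Hmat = rng.normal(size=(8, 8)) + 1j * rng.normal(size=(8, 8))  # toy channel

def waterfill_capacity(Hc: np.ndarray, total_power: float, noise: float) -> float:
    s = np.linalg.svd(Hc, compute_uv=False)      # singular values = spatial modes
    gains = s**2 / noise
    # Bisection on the water level mu so allocated power sums to total_power.
    lo, hi = 0.0, total_power + 1.0 / gains.min()
    for _ in range(100):
        mu = 0.5 * (lo + hi)
        p = np.clip(mu - 1.0 / gains, 0.0, None)
        lo, hi = (mu, hi) if p.sum() < total_power else (lo, mu)
    p = np.clip(mu - 1.0 / gains, 0.0, None)
    return float(np.log2(1.0 + gains * p).sum())  # bits/s/Hz

print(waterfill_capacity(Hmat, total_power=1.0, noise=0.1))
```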

6. arXiv:2409.18255 [pdf] (eess.SP)
Title: Sensitivity of quantitative diffusion MRI tractography and microstructure to anisotropic spatial sampling
Authors: Elyssa M. McMaster, Nancy R. Newlin, Chloe Cho, Gaurav Rudravaram, Adam M. Saunders, Aravind R. Krishnan, Lucas W. Remedios, Michael E. Kim, Hanliang Xu, Kurt G. Schilling, François Rheault, Laurie E. Cutting, Bennett A. Landman
Abstract: Purpose: Diffusion weighted MRI (dMRI) and its models of neural structure provide insight into human brain organization and variations in white matter. A recent study by McMaster et al. showed that complex graph measures of the connectome, the graphical representation of a tractogram, vary with spatial sampling changes, but the biases introduced by anisotropic voxels in the process have not been well characterized. This study uses microstructural measures (fractional anisotropy and mean diffusivity) and white matter bundle properties (bundle volume, length, and surface area) to further understand the effect of anisotropic voxels on microstructure and tractography. Methods: The statistical significance of the selected measures derived from dMRI data was assessed by comparing three white matter bundles at different spatial resolutions with 44 subjects from the Human Connectome Project Young Adult dataset scan/rescan data, using the Wilcoxon signed-rank test. The original isotropic resolution (1.25 mm isotropic) was compared with six anisotropic resolutions generated in 0.25 mm incremental steps along the z dimension. Then, all generated resolutions were upsampled to 1.25 mm isotropic and 1 mm isotropic. Results: There were statistically significant differences in at least one microstructural and one bundle measure at every resolution (p ≤ 0.05, corrected for multiple comparisons). Cohen's d coefficient evaluated the effect size of anisotropic voxels on microstructure and tractography. Conclusion: Fractional anisotropy and mean diffusivity cannot be recovered from low-quality data to match gold-standard data by basic upsampling. However, the bundle measures from tractograms become more repeatable when voxels are resampled to 1 mm isotropic.
Submitted 26 September, 2024; originally announced September 2024.
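
The statistical machinery here is a standard paired nonparametric test. A sketch with synthetic stand-ins for the 44-subject scan/rescan measures:

```python
# Paired Wilcoxon signed-rank comparison of a bundle measure at two resolutions.
import numpy as np
from scipy.stats import wilcoxon

rng = np.random.default_rng(1)
fa_isotropic = rng.normal(0.50, 0.02, size=44)                   # FA at 1.25 mm isotropic
fa_anisotropic = fa_isotropic + rng.normal(0.01, 0.01, size=44)  # FA with anisotropic z-sampling

stat, p = wilcoxon(fa_isotropic, fa_anisotropic)
print(f"W={stat:.1f}, p={p:.4f}")  # p <= 0.05 after correction => significant bias
```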

7. arXiv:2409.14090 [pdf, other] (eess.IV, cs.CV)
Title: Window-based Channel Attention for Wavelet-enhanced Learned Image Compression
Authors: Heng Xu, Bowen Hai, Yushun Tang, Zhihai He
Abstract: Learned Image Compression (LIC) models have achieved superior rate-distortion performance compared with traditional codecs. Existing LIC models use CNN, Transformer, or mixed CNN-Transformer architectures as basic blocks. However, limited by the shifted window attention, Swin-Transformer-based LIC exhibits restricted growth of receptive fields, affecting its ability to model large objects for image compression. To address this issue and improve performance, we incorporate window partitioning into channel attention for the first time, obtaining large receptive fields and capturing more global information. Since channel attention hinders local information learning, it is important to extend the existing attention mechanisms in Transformer codecs to space-channel attention, establishing multiple receptive fields that capture global correlations with large receptive fields while maintaining detailed characterization of local correlations with small receptive fields. We also incorporate the discrete wavelet transform into our Spatial-Channel Hybrid (SCH) framework for efficient frequency-dependent down-sampling and further enlarged receptive fields. Experimental results demonstrate that our method achieves state-of-the-art performance, reducing BD-rate by 18.54%, 23.98%, 22.33%, and 24.71% on four standard datasets compared with VTM-23.1.
Submitted 10 October, 2024; v1 submitted 21 September, 2024; originally announced September 2024.
Comments: ACCV2024 accepted; camera-ready version
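
The core mechanism, channel attention computed within non-overlapping spatial windows, can be sketched briefly. Shapes, the window size, and the squeeze-MLP design below are assumptions for illustration, not the paper's code.

```python
# Channel attention computed per spatial window: each window gets its own
# channel weights, widening effective receptive fields at low cost.
import torch
import torch.nn as nn

class WindowChannelAttention(nn.Module):
    def __init__(self, channels: int, window: int):
        super().__init__()
        self.window = window
        self.mlp = nn.Sequential(
            nn.Linear(channels, channels // 4), nn.ReLU(),
            nn.Linear(channels // 4, channels), nn.Sigmoid(),
        )

    def forward(self, x):                      # x: (B, C, H, W); H, W % window == 0
        B, C, H, W = x.shape
        w = self.window
        # Partition into (B * nWindows, C, w, w) windows.
        xw = x.view(B, C, H // w, w, W // w, w).permute(0, 2, 4, 1, 3, 5)
        xw = xw.reshape(-1, C, w, w)
        scale = self.mlp(xw.mean(dim=(2, 3)))  # per-window channel descriptor
        xw = xw * scale[:, :, None, None]      # reweight channels per window
        # Reverse the partition.
        xw = xw.view(B, H // w, W // w, C, w, w).permute(0, 3, 1, 4, 2, 5)
        return xw.reshape(B, C, H, W)

attn = WindowChannelAttention(64, window=8)
y = attn(torch.randn(2, 64, 32, 32))           # -> (2, 64, 32, 32)
```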
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.14090v4-abstract-full').style.display = 'none'; document.getElementById('2409.14090v4-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 21 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ACCV2024 accepted; camera-ready version</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.11214">arXiv:2409.11214</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.11214">pdf</a>, <a href="https://arxiv.org/format/2409.11214">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Ideal-LLM: Integrating Dual Encoders and Language-Adapted LLM for Multilingual Speech-to-Text </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Xue%2C+H">Hongfei Xue</a>, <a href="/search/eess?searchtype=author&amp;query=Ren%2C+W">Wei Ren</a>, <a href="/search/eess?searchtype=author&amp;query=Geng%2C+X">Xuelong Geng</a>, <a href="/search/eess?searchtype=author&amp;query=Wei%2C+K">Kun Wei</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+L">Longhao Li</a>, <a href="/search/eess?searchtype=author&amp;query=Shao%2C+Q">Qijie Shao</a>, <a href="/search/eess?searchtype=author&amp;query=Yang%2C+L">Linju Yang</a>, <a href="/search/eess?searchtype=author&amp;query=Diao%2C+K">Kai Diao</a>, <a href="/search/eess?searchtype=author&amp;query=Xie%2C+L">Lei Xie</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.11214v1-abstract-short" style="display: inline;"> Integrating audio encoders with LLMs through connectors has enabled these models to process and comprehend audio modalities, significantly enhancing speech-to-text tasks, including automatic speech recognition (ASR) and automatic speech translation (AST). However, these methods often overlook the critical aspect of language adaptation in multilingual settings, relying instead on multilingual data&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.11214v1-abstract-full').style.display = 'inline'; document.getElementById('2409.11214v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.11214v1-abstract-full" style="display: none;"> Integrating audio encoders with LLMs through connectors has enabled these models to process and comprehend audio modalities, significantly enhancing speech-to-text tasks, including automatic speech recognition (ASR) and automatic speech translation (AST). 
However, these methods often overlook the critical aspect of language adaptation in multilingual settings, relying instead on multilingual data without adequately addressing language differences. To address this gap, we propose the Ideal-LLM model, which employs dual multilingual encoders to enrich language feature information and utilizes a language-adapted connector to target the adaptation of each language specifically. By leveraging the complementary strengths of Whisper and MMS encoders, our approach ensures richer multilingual representations. Additionally, the language-adapted connector enhances modal transformation via a language weight selector tailored for each language. Experimental results demonstrate that Ideal-LLM significantly improves ASR performance, achieving a 32.6% relative reduction in average word error rates compared to the standard speech encoder integrated with LLMs and yields an average BLEU score of 36.78 for AST task. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.11214v1-abstract-full').style.display = 'none'; document.getElementById('2409.11214v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages, 3 figures, submitted to ICASSP 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.05601">arXiv:2409.05601</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.05601">pdf</a>, <a href="https://arxiv.org/format/2409.05601">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Longer is (Not Necessarily) Stronger: Punctuated Long-Sequence Training for Enhanced Speech Recognition and Translation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Koluguri%2C+N+R">Nithin Rao Koluguri</a>, <a href="/search/eess?searchtype=author&amp;query=Bartley%2C+T">Travis Bartley</a>, <a href="/search/eess?searchtype=author&amp;query=Xu%2C+H">Hainan Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Hrinchuk%2C+O">Oleksii Hrinchuk</a>, <a href="/search/eess?searchtype=author&amp;query=Balam%2C+J">Jagadeesh Balam</a>, <a href="/search/eess?searchtype=author&amp;query=Ginsburg%2C+B">Boris Ginsburg</a>, <a href="/search/eess?searchtype=author&amp;query=Kucsko%2C+G">Georg Kucsko</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.05601v1-abstract-short" style="display: inline;"> This paper presents a new method for training sequence-to-sequence models for speech recognition and translation tasks. 
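
A rough sketch of the language-adapted fusion idea: frame features from two encoders are mixed with per-language weights before the connector. The selector design and dimensions are guesses for illustration only; the paper's connector is more elaborate.

```python
# Per-language weighted mixing of two encoders' features before an LLM connector.
import torch
import torch.nn as nn

class LanguageWeightedFusion(nn.Module):
    def __init__(self, dim: int, num_languages: int):
        super().__init__()
        # One pair of mixing logits per language: softmax -> weights for 2 encoders.
        self.lang_logits = nn.Embedding(num_languages, 2)
        self.proj = nn.Linear(dim, dim)  # stand-in for the connector

    def forward(self, feat_a, feat_b, lang_id):
        w = torch.softmax(self.lang_logits(lang_id), dim=-1)   # (B, 2)
        fused = w[:, 0, None, None] * feat_a + w[:, 1, None, None] * feat_b
        return self.proj(fused)

fusion = LanguageWeightedFusion(dim=512, num_languages=10)
a = torch.randn(4, 100, 512)                     # Whisper-like features (B, T, D)
b = torch.randn(4, 100, 512)                     # MMS-like features
out = fusion(a, b, torch.tensor([0, 3, 3, 7]))   # per-utterance language IDs
```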

9. arXiv:2409.05601 [pdf, other] (eess.AS, cs.CL)
Title: Longer is (Not Necessarily) Stronger: Punctuated Long-Sequence Training for Enhanced Speech Recognition and Translation
Authors: Nithin Rao Koluguri, Travis Bartley, Hainan Xu, Oleksii Hrinchuk, Jagadeesh Balam, Boris Ginsburg, Georg Kucsko
Abstract: This paper presents a new method for training sequence-to-sequence models for speech recognition and translation tasks. Instead of the traditional approach of training models on short segments containing only lowercase or partial punctuation and capitalization (PnC) sentences, we propose training on longer utterances that include complete sentences with proper punctuation and capitalization. We achieve this by using the FastConformer architecture, which allows training 1-billion-parameter models on sequences up to 60 seconds long with full attention. However, while training with PnC enhances overall performance, we observed that accuracy plateaus when training on sequences longer than 40 seconds across various evaluation settings. Our proposed method significantly improves punctuation and capitalization accuracy, showing a 25% relative word error rate (WER) improvement on the Earnings-21 and Earnings-22 benchmarks. Additionally, training on longer audio segments increases overall model accuracy across speech recognition and translation benchmarks. The model weights and training code are open-sourced through NVIDIA NeMo.
Submitted 9 September, 2024; originally announced September 2024.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at SLT 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.05430">arXiv:2409.05430</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.05430">pdf</a>, <a href="https://arxiv.org/format/2409.05430">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Findings of the 2024 Mandarin Stuttering Event Detection and Automatic Speech Recognition Challenge </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Xue%2C+H">Hongfei Xue</a>, <a href="/search/eess?searchtype=author&amp;query=Gong%2C+R">Rong Gong</a>, <a href="/search/eess?searchtype=author&amp;query=Shao%2C+M">Mingchen Shao</a>, <a href="/search/eess?searchtype=author&amp;query=Xu%2C+X">Xin Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+L">Lezhi Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Xie%2C+L">Lei Xie</a>, <a href="/search/eess?searchtype=author&amp;query=Bu%2C+H">Hui Bu</a>, <a href="/search/eess?searchtype=author&amp;query=Zhou%2C+J">Jiaming Zhou</a>, <a href="/search/eess?searchtype=author&amp;query=Qin%2C+Y">Yong Qin</a>, <a href="/search/eess?searchtype=author&amp;query=Du%2C+J">Jun Du</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+M">Ming Li</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+B">Binbin Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Jia%2C+B">Bin Jia</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.05430v1-abstract-short" style="display: inline;"> The StutteringSpeech Challenge focuses on advancing speech technologies for people who stutter, specifically targeting Stuttering Event Detection (SED) and Automatic Speech Recognition (ASR) in Mandarin. The challenge comprises three tracks: (1) SED, which aims to develop systems for detection of stuttering events; (2) ASR, which focuses on creating robust systems for recognizing stuttered speech;&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.05430v1-abstract-full').style.display = 'inline'; document.getElementById('2409.05430v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.05430v1-abstract-full" style="display: none;"> The StutteringSpeech Challenge focuses on advancing speech technologies for people who stutter, specifically targeting Stuttering Event Detection (SED) and Automatic Speech Recognition (ASR) in Mandarin. The challenge comprises three tracks: (1) SED, which aims to develop systems for detection of stuttering events; (2) ASR, which focuses on creating robust systems for recognizing stuttered speech; and (3) Research track for innovative approaches utilizing the provided dataset. We utilizes an open-source Mandarin stuttering dataset AS-70, which has been split into new training and test sets for the challenge. 
This paper presents the dataset, details the challenge tracks, and analyzes the performance of the top systems, highlighting improvements in detection accuracy and reductions in recognition error rates. Our findings underscore the potential of specialized models and augmentation strategies in developing stuttered speech technologies. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.05430v1-abstract-full').style.display = 'none'; document.getElementById('2409.05430v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages, 2 figures, accepted by SLT 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.02041">arXiv:2409.02041</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.02041">pdf</a>, <a href="https://arxiv.org/format/2409.02041">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> The USTC-NERCSLIP Systems for the CHiME-8 NOTSOFAR-1 Challenge </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Niu%2C+S">Shutong Niu</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+R">Ruoyu Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Du%2C+J">Jun Du</a>, <a href="/search/eess?searchtype=author&amp;query=Yang%2C+G">Gaobin Yang</a>, <a href="/search/eess?searchtype=author&amp;query=Tu%2C+Y">Yanhui Tu</a>, <a href="/search/eess?searchtype=author&amp;query=Wu%2C+S">Siyuan Wu</a>, <a href="/search/eess?searchtype=author&amp;query=Qian%2C+S">Shuangqing Qian</a>, <a href="/search/eess?searchtype=author&amp;query=Wu%2C+H">Huaxin Wu</a>, <a href="/search/eess?searchtype=author&amp;query=Xu%2C+H">Haitao Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+X">Xueyang Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Zhong%2C+G">Guolong Zhong</a>, <a href="/search/eess?searchtype=author&amp;query=Yu%2C+X">Xindi Yu</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+J">Jieru Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+M">Mengzhi Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Cai%2C+D">Di Cai</a>, <a href="/search/eess?searchtype=author&amp;query=Gao%2C+T">Tian Gao</a>, <a href="/search/eess?searchtype=author&amp;query=Wan%2C+G">Genshun Wan</a>, <a href="/search/eess?searchtype=author&amp;query=Ma%2C+F">Feng Ma</a>, <a href="/search/eess?searchtype=author&amp;query=Pan%2C+J">Jia Pan</a>, <a href="/search/eess?searchtype=author&amp;query=Gao%2C+J">Jianqing Gao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.02041v2-abstract-short" style="display: inline;"> This technical report outlines our submission system for the CHiME-8 NOTSOFAR-1 
Challenge. The primary difficulty of this challenge is the dataset recorded across various conference rooms, which captures real-world complexities such as high overlap rates, background noises, a variable number of speakers, and natural conversation styles. To address these issues, we optimized the system in several aspects: For front-end speech signal processing, we introduced a data-driven joint training method for diarization and separation (JDS) to enhance audio quality. Additionally, we integrated traditional guided source separation (GSS) for the multi-channel track to provide complementary information for the JDS. For back-end speech recognition, we enhanced Whisper with WavLM, ConvNeXt, and Transformer innovations, applying multi-task training and Noise KLD augmentation to significantly advance ASR robustness and accuracy. Our system attained a Time-Constrained minimum Permutation Word Error Rate (tcpWER) of 14.265% and 22.989% on the CHiME-8 NOTSOFAR-1 Dev-set-2 multi-channel and single-channel tracks, respectively.
Submitted 24 October, 2024; v1 submitted 3 September, 2024; originally announced September 2024.

arXiv:2408.14969 [pdf, other] https://arxiv.org/abs/2408.14969
Subjects: cs.IT (Information Theory); eess.SP (Signal Processing)
Title: Secrecy Performance Analysis of RIS-Aided Fluid Antenna Systems
Authors: Farshad Rostami Ghadi, Kai-Kit Wong, Masoud Kaveh, F.
Javier Lopez-Martinez, Wee Kiat New, Hao Xu
Abstract: This paper examines the impact of emerging fluid antenna systems (FAS) on reconfigurable intelligent surface (RIS)-aided secure communications. Specifically, we consider a classic wiretap channel, where a fixed-antenna transmitter sends confidential information to an FAS-equipped legitimate user with the help of an RIS, while an FAS-equipped eavesdropper attempts to decode the message. To evaluate the proposed wireless scenario, we first introduce the cumulative distribution function (CDF) and probability density function (PDF) of the signal-to-noise ratio (SNR) at each node, using the central limit theorem and the Gaussian copula function. We then derive a compact analytical expression for the secrecy outage probability (SOP). Our numerical results reveal how the incorporation of FAS and RIS can significantly enhance the performance of secure communications.
Submitted 27 August, 2024; originally announced August 2024.
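For orientation, the secrecy outage probability named in this abstract has a standard wiretap-channel definition; the notation below is the generic textbook form, not necessarily the paper's exact expression:

% Generic wiretap-channel definitions (standard notation, not the paper's):
% secrecy capacity C_s and the outage probability of missing a target rate R_s.
C_s = \left[ \log_2(1+\gamma_B) - \log_2(1+\gamma_E) \right]^{+}, \qquad
\mathrm{SOP} = \Pr\{ C_s < R_s \}
            = \int_0^\infty F_{\gamma_B}\!\big(2^{R_s}(1+x)-1\big)\, f_{\gamma_E}(x)\, dx,

where gamma_B and gamma_E are the SNRs at the legitimate user and the eavesdropper, and F and f are the CDF and PDF that the abstract derives for each node.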
arXiv:2408.11227 [pdf] https://arxiv.org/abs/2408.11227
Subjects: eess.IV (Image and Video Processing); cs.AI (Artificial Intelligence); cs.CV (Computer Vision and Pattern Recognition)
Title: OCTCube: A 3D foundation model for optical coherence tomography that improves cross-dataset, cross-disease, cross-device and cross-modality analysis
Authors: Zixuan Liu, Hanwen Xu, Addie Woicik, Linda G. Shapiro, Marian Blazes, Yue Wu, Cecilia S. Lee, Aaron Y. Lee, Sheng Wang
Abstract: Optical coherence tomography (OCT) has become critical for diagnosing retinal diseases as it enables 3D imaging of the retina and optic nerve. OCT acquisition is fast, non-invasive, affordable, and scalable. Due to its broad applicability, massive numbers of OCT images have been accumulated in routine exams, making it possible to train large-scale foundation models that can generalize to various diagnostic tasks using OCT images. Nevertheless, existing foundation models for OCT only consider 2D image slices, overlooking the rich 3D structure. Here, we present OCTCube, a 3D foundation model pre-trained on 26,605 3D OCT volumes encompassing 1.62 million 2D OCT images. OCTCube is developed based on 3D masked autoencoders and exploits FlashAttention to reduce the increased GPU memory usage caused by modeling 3D volumes.
OCTCube outperforms 2D models when predicting 8 retinal diseases in both inductive and cross-dataset settings, indicating that utilizing the 3D structure in the model instead of 2D data results in significant improvement. OCTCube further shows superior performance on cross-device prediction and when predicting systemic diseases, such as diabetes and hypertension, further demonstrating its strong generalizability. Finally, we propose a contrastive-self-supervised-learning-based OCT-IR pre-training framework (COIP) for cross-modality analysis on OCT and infrared retinal (IR) images, where the OCT volumes are embedded using OCTCube. We demonstrate that COIP enables accurate alignment between OCT and IR en face images. Collectively, OCTCube, a 3D OCT foundation model, demonstrates significantly better performance than 2D models on 27 out of 29 tasks and comparable performance on the other two, paving the way for AI-based retinal disease diagnosis.
Submitted 20 August, 2024; originally announced August 2024.

arXiv:2408.10680 [pdf, other] https://arxiv.org/abs/2408.10680
Subjects: cs.CL (Computation and Language); cs.SD (Sound); eess.AS (Audio and Speech Processing)
Title: Towards Rehearsal-Free Multilingual ASR: A LoRA-based Case Study on Whisper
Authors: Tianyi Xu, Kaixun Huang, Pengcheng Guo, Yu Zhou, Longtao Huang, Hui Xue, Lei Xie
Abstract: Pre-trained multilingual speech foundation models, like Whisper, have shown impressive performance across different languages. However, adapting these models to new or specific languages is computationally expensive and faces catastrophic forgetting problems.
Addressing these issues, our study investigates strategies to enhance the model on new languages in the absence of the original training data, while also preserving the established performance on the original languages. Specifically, we first compare various LoRA-based methods to find out their vulnerability to forgetting. To mitigate this issue, we propose to leverage the LoRA parameters from the original model for approximate orthogonal gradient descent on the new samples. Additionally, we introduce a learnable rank coefficient to allocate trainable parameters for more efficient training. Our experiments with a Chinese Whisper model (for Uyghur and Tibetan) yield better results with a more compact parameter set.
Submitted 20 August, 2024; originally announced August 2024.
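A minimal PyTorch sketch of the general idea described here, projecting new-language gradients to be approximately orthogonal to the original LoRA directions; all shapes and names are hypothetical, and the paper's actual procedure may differ:

import torch

def project_orthogonal(grad: torch.Tensor, basis: torch.Tensor) -> torch.Tensor:
    """Remove the component of `grad` lying in the span of `basis` columns,
    keeping updates roughly orthogonal to directions the frozen adapter uses."""
    q, _ = torch.linalg.qr(basis)          # q: (d, r), orthonormal columns
    return grad - q @ (q.T @ grad)         # subtract projection onto span(q)

# Hypothetical sizes: d = hidden dimension, r = LoRA rank.
d, r = 768, 8
lora_A_orig = torch.randn(d, r)            # frozen LoRA factor from the original model
lora_A_new = torch.randn(d, r, requires_grad=True)  # trainable factor for the new language

loss = lora_A_new.sum()                    # stand-in for the real ASR training loss
loss.backward()
with torch.no_grad():
    lora_A_new.grad = project_orthogonal(lora_A_new.grad, lora_A_orig)
    lora_A_new -= 1e-3 * lora_A_new.grad   # plain SGD step on the projected gradient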
arXiv:2408.10096 [pdf, other] https://arxiv.org/abs/2408.10096
Subjects: cs.SD (Sound); cs.AI (Artificial Intelligence); eess.AS (Audio and Speech Processing)
DOI: 10.1145/3664647.3681539 (https://doi.org/10.1145/3664647.3681539)
Title: Convert and Speak: Zero-shot Accent Conversion with Minimum Supervision
Authors: Zhijun Jia, Huaying Xue, Xiulian Peng, Yan Lu
Abstract: The scarcity of parallel data is the key challenge of the accent conversion (AC) problem, in which both the pronunciation units and the prosody pattern need to be converted. We propose a two-stage generative framework, "convert-and-speak", in which the conversion operates only on the semantic token level and the speech is synthesized conditioned on the converted semantic token with a speech generative model in the target accent domain. The decoupling design enables the "speaking" module to use a massive amount of target accent speech and relieves the parallel data required for the "conversion" module. Conversion with the bridge of semantic tokens also relieves the requirement for data with text transcriptions and unlocks the usage of language pre-training technology to further reduce the need for parallel accent speech data. To reduce the complexity and latency of "speaking", a single-stage AR generative model is designed to achieve good quality as well as lower computation cost.
Experiments on Indian-English to general American-English conversion show that the proposed framework achieves state-of-the-art performance in accent similarity, speech quality, and speaker maintenance with only 15 minutes of weakly parallel data that is not constrained to the same speaker. Extensive experimentation with diverse accent types suggests that this framework possesses a high degree of adaptability, making it readily scalable to accommodate other accents with low-resource data. Audio samples are available at https://www.microsoft.com/en-us/research/project/convert-and-speak-zero-shot-accent-conversion-with-minimumsupervision/.
Submitted 22 August, 2024; v1 submitted 19 August, 2024; originally announced August 2024.
Comments: 9 pages, ACM MM 2024 (accepted)

arXiv:2408.07532 [pdf, other] https://arxiv.org/abs/2408.07532
Subjects: eess.IV (Image and Video Processing); cs.CV (Computer Vision and Pattern Recognition)
Title: Improved 3D Whole Heart Geometry from Sparse CMR Slices
Authors: Yiyang Xu, Hao Xu, Matthew Sinclair, Esther Puyol-Antón, Steven A Niederer, Amedeo Chiribiri, Steven E Williams, Michelle C Williams, Alistair A Young
Abstract: Cardiac magnetic resonance (CMR) imaging and computed tomography (CT) are two common non-invasive imaging methods for assessing patients with cardiovascular disease. CMR typically acquires multiple sparse 2D slices, with unavoidable respiratory motion artefacts between slices, whereas CT acquires isotropic dense data but uses ionising radiation.
In this study, we explore the combination of the Slice Shifting Algorithm (SSA), Spatial Transformer Network (STN), and Label Transformer Network (LTN) to: 1) correct respiratory motion between segmented slices, and 2) transform sparse segmentation data into dense segmentation. All combinations were validated using synthetic motion-corrupted CMR slice segmentations generated from CT in 1699 cases, where the dense CT serves as the ground truth. In 199 testing cases, SSA-LTN achieved the best results for Dice score and Hausdorff distance (94.0% and 4.7 mm respectively, averaged over 5 labels) but gave topological errors in 8 cases. STN was effective as a plug-in tool for correcting all topological errors with minimal impact on overall performance (93.5% and 5.0 mm respectively). SSA also proves to be a valuable plug-in tool, enhancing performance over both STN-based and LTN-based models. The code for these different combinations is available at https://github.com/XESchong/STACOM2024.
Submitted 14 August, 2024; originally announced August 2024.
Comments: 13 pages, STACOM2024
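For reference, the Dice score reported above is the standard overlap metric between a predicted and a ground-truth segmentation; a minimal NumPy version (generic, not the paper's evaluation code):

import numpy as np

def dice_score(pred: np.ndarray, truth: np.ndarray) -> float:
    """Dice = 2|A n B| / (|A| + |B|) for binary masks; 1.0 means perfect overlap."""
    pred, truth = pred.astype(bool), truth.astype(bool)
    denom = pred.sum() + truth.sum()
    if denom == 0:
        return 1.0  # both masks empty: conventionally treated as a perfect match
    return 2.0 * np.logical_and(pred, truth).sum() / denom

# Toy example with two slightly offset square masks.
a = np.zeros((10, 10), bool); a[2:8, 2:8] = True
b = np.zeros((10, 10), bool); b[2:8, 3:8] = True
print(round(dice_score(a, b), 3))  # 0.909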
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">13 pages, STACOM2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.02966">arXiv:2408.02966</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.02966">pdf</a>, <a href="https://arxiv.org/format/2408.02966">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Fast Point Cloud Geometry Compression with Context-based Residual Coding and INR-based Refinement </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Xu%2C+H">Hao Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+X">Xi Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Wu%2C+X">Xiaolin Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.02966v1-abstract-short" style="display: inline;"> Compressing a set of unordered points is far more challenging than compressing images/videos of regular sample grids, because of the difficulties in characterizing neighboring relations in an irregular layout of points. Many researchers resort to voxelization to introduce regularity, but this approach suffers from quantization loss. In this research, we use the KNN method to determine the neighbor&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.02966v1-abstract-full').style.display = 'inline'; document.getElementById('2408.02966v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.02966v1-abstract-full" style="display: none;"> Compressing a set of unordered points is far more challenging than compressing images/videos of regular sample grids, because of the difficulties in characterizing neighboring relations in an irregular layout of points. Many researchers resort to voxelization to introduce regularity, but this approach suffers from quantization loss. In this research, we use the KNN method to determine the neighborhoods of raw surface points. This gives us a means to determine the spatial context in which the latent features of 3D points are compressed by arithmetic coding. As such, the conditional probability model is adaptive to local geometry, leading to significant rate reduction. Additionally, we propose a dual-layer architecture where a non-learning base layer reconstructs the main structures of the point cloud at low complexity, while a learned refinement layer focuses on preserving fine details. This design leads to reductions in model complexity and coding latency by two orders of magnitude compared to SOTA methods. Moreover, we incorporate an implicit neural representation (INR) into the refinement layer, allowing the decoder to sample points on the underlying surface at arbitrary densities. 
arXiv:2408.01351 [pdf] https://arxiv.org/abs/2408.01351
Subjects: physics.med-ph (Medical Physics); eess.IV (Image and Video Processing); eess.SP (Signal Processing)
Title: Harmonized connectome resampling for variance in voxel sizes
Authors: Elyssa M. McMaster, Nancy R. Newlin, Gaurav Rudravaram, Adam M. Saunders, Aravind R. Krishnan, Lucas W. Remedios, Michael E. Kim, Hanliang Xu, Derek B. Archer, Kurt G. Schilling, François Rheault, Laurie E. Cutting, Bennett A. Landman
Abstract: To date, there has been no comprehensive study characterizing the effect of diffusion-weighted magnetic resonance imaging voxel resolution on the resulting connectome for high-resolution subject data. Similarity in results improved with higher resolution, even after initial down-sampling. To ensure robust tractography and connectomes, resample data to 1 mm isotropic resolution.
Submitted 2 August, 2024; originally announced August 2024.

arXiv:2407.20893 [pdf, other] https://arxiv.org/abs/2407.20893
Subjects: cs.LG (Machine Learning); cs.AI (Artificial Intelligence); eess.SP (Signal Processing)
Title: MambaCapsule: Towards Transparent Cardiac Disease Diagnosis with Electrocardiography Using Mamba Capsule Network
Authors: Yinlong Xu, Xiaoqiang Liu, Zitai Kong, Yixuan Wu, Yue Wang, Yingzhou Lu, Honghao Gao, Jian Wu, Hongxia Xu
Abstract: Cardiac arrhythmia, a condition characterized by irregular heartbeats, often serves as an early indication of various heart ailments. With the advent of deep learning, numerous innovative models have been introduced for diagnosing arrhythmias using electrocardiogram (ECG) signals. However, recent studies focus solely on the performance of models, neglecting the interpretation of their results.
This leads to a considerable lack of transparency, posing a significant risk in the actual diagnostic process. To solve this problem, this paper introduces MambaCapsule, a deep neural network for ECG arrhythmia classification that increases the explainability of the model while enhancing the accuracy. Our model utilizes Mamba for feature extraction and Capsule networks for prediction, providing not only a confidence score but also signal features. Akin to the processing mechanism of the human brain, the model learns signal features and the relationships between them by reconstructing ECG signals in the predicted selection. The model evaluation was conducted on the MIT-BIH and PTB datasets, following the AAMI standard. MambaCapsule achieved a total accuracy of 99.54% and 99.59% on the respective test sets. These results demonstrate the model's promising performance under the standard test protocol.
Submitted 30 July, 2024; originally announced July 2024.
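In standard capsule networks (the generic convention, not necessarily this paper's exact prediction head), the per-class confidence score mentioned above is the length of each class capsule's output vector after squashing; a minimal PyTorch sketch with hypothetical dimensions:

import torch

def squash(v: torch.Tensor, dim: int = -1) -> torch.Tensor:
    """Capsule-network squash: keeps direction, maps the norm into [0, 1)."""
    n2 = (v * v).sum(dim=dim, keepdim=True)
    return (n2 / (1.0 + n2)) * v / (n2.sqrt() + 1e-8)

# Hypothetical output: batch of 2, five arrhythmia classes, 16-D capsules.
caps = squash(torch.randn(2, 5, 16))
conf = caps.norm(dim=-1)          # per-class confidence = capsule length
pred = conf.argmax(dim=-1)        # predicted class per example
print(conf.shape, pred)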
arXiv:2407.20251 [pdf, other] https://arxiv.org/abs/2407.20251
Subjects: eess.SP (Signal Processing); cond-mat.mtrl-sci (Materials Science); cs.LG (Machine Learning)
Title: An Uncertainty-aware Deep Learning Framework-based Robust Design Optimization of Metamaterial Units
Authors: Zihan Wang, Anindya Bhaduri, Hongyi Xu, Liping Wang
Abstract: Mechanical metamaterials represent an innovative class of artificial structures, distinguished by their extraordinary mechanical characteristics, which are beyond the scope of traditional natural materials. The use of deep generative models has become increasingly popular in the design of metamaterial units. The effectiveness of using deep generative models lies in their capacity to compress complex input data into a simplified, lower-dimensional latent space, while also enabling the creation of novel optimal designs through sampling within this space. However, the design process does not take into account the effect of model uncertainty due to data sparsity or the effect of input data uncertainty due to inherent randomness in the data. This might lead to the generation of undesirable structures with high sensitivity to the uncertainties in the system. To address this issue, a novel uncertainty-aware deep learning framework-based robust design approach is proposed for the design of metamaterial units with optimal target properties. The proposed approach utilizes the probabilistic nature of the deep learning framework and quantifies both aleatoric and epistemic uncertainties associated with surrogate-based design optimization.
We demonstrate that the proposed design approach is capable of designing high-performance metamaterial units with high reliability. To showcase the effectiveness of the proposed design approach, a single-objective design optimization problem and a multi-objective design optimization problem are presented. The optimal robust designs obtained are validated by comparing them to the designs obtained from the topology optimization method, as well as the designs obtained from a deterministic deep learning framework-based design optimization where none of the uncertainties in the system are explicitly considered.
Submitted 19 July, 2024; originally announced July 2024.

arXiv:2407.16986 [pdf, other] https://arxiv.org/abs/2407.16986
Subjects: eess.IV (Image and Video Processing)
DOI: 10.1049/ipr2.12920 (https://doi.org/10.1049/ipr2.12920)
Title: Cuboid-Net: A Multi-Branch Convolutional Neural Network for Joint Space-Time Video Super Resolution
Authors: Congrui Fu, Hui Yuan, Hongji Xu, Hao Zhang, Liquan Shen
Abstract: The demand for high-resolution videos has been consistently rising across various domains, propelled by continuous advancements in science, technology, and society. Nonetheless, challenges arising from limitations in imaging equipment capabilities and imaging conditions, as well as economic and temporal factors, often result in low-resolution images in particular situations.
Space-time video super-resolution aims to enhance the spatial and temporal resolutions of low-resolution and low-frame-rate videos. The currently available space-time video super-resolution methods often fail to fully exploit the abundant information existing within the spatio-temporal domain. To address this problem, we conceptualize the input low-resolution video as a cuboid structure. Drawing on this perspective, we introduce an innovative methodology called "Cuboid-Net," which incorporates a multi-branch convolutional neural network. Cuboid-Net is designed to collectively enhance the spatial and temporal resolutions of videos, enabling the extraction of rich and meaningful information across both spatial and temporal dimensions. Specifically, we take the input video as a cuboid to generate different directional slices as input for different branches of the network. The proposed network contains four modules, i.e., a multi-branch-based hybrid feature extraction (MBFE) module, a multi-branch-based reconstruction (MBR) module, a first-stage quality enhancement (QE) module, and a second-stage cross-frame quality enhancement (CFQE) module for interpolated frames only. Experimental results demonstrate that the proposed method is effective not only for spatial and temporal super-resolution of video but also for spatial and angular super-resolution of light fields.
Submitted 24 July, 2024; originally announced July 2024.
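To make the "directional slices of a video cuboid" idea concrete, a minimal NumPy sketch with toy shapes (an illustration of the slicing view, not the paper's code); each transposed view yields the slice set that one branch of such a network could consume:

import numpy as np

# A toy grayscale video cuboid: T frames of H x W pixels.
video = np.random.rand(8, 64, 48)             # (T, H, W)

frames   = video                               # T slices of shape (H, W): the usual view
h_slices = video.transpose(1, 0, 2)            # H slices of shape (T, W): horizontal cut
w_slices = video.transpose(2, 0, 1)            # W slices of shape (T, H): vertical cut

for name, s in [("frames", frames), ("h_slices", h_slices), ("w_slices", w_slices)]:
    print(name, s.shape)                       # each slice set could feed one branch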
arXiv:2407.06519 [pdf, other] https://arxiv.org/abs/2407.06519
Subjects: eess.IV (Image and Video Processing)
Title: F2PAD: A General Optimization Framework for Feature-Level to Pixel-Level Anomaly Detection
Authors: Chengyu Tao, Hao Xu, Juan Du
Abstract: Image-based inspection systems have been widely deployed in manufacturing production lines. Due to the scarcity of defective samples, unsupervised anomaly detection, which leverages only normal samples during training to detect various defects, is popular. Existing feature-based methods, utilizing deep features from pretrained neural networks, show impressive performance in anomaly localization and require only small sample sizes for training. However, the detected anomalous regions of these methods always exhibit inaccurate boundaries, which impedes downstream tasks. This deficiency is caused by: (i) the decreased resolution of high-level features compared with the original image, and (ii) the mixture of adjacent normal and anomalous pixels during feature extraction. To address them, we propose a novel unified optimization framework (F2PAD) that leverages Feature-level information to guide the optimization process for Pixel-level Anomaly Detection in the inference stage. The proposed framework is universal and plug-and-play, and can enhance various feature-based methods with limited assumptions. Case studies are provided to demonstrate the effectiveness of our strategy, particularly when applied to three popular backbone methods: PaDiM, CFLOW-AD, and PatchCore.
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.06519v2-abstract-full').style.display = 'none'; document.getElementById('2407.06519v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 8 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.05791">arXiv:2407.05791</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.05791">pdf</a>, <a href="https://arxiv.org/format/2407.05791">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Joint Beamforming and Antenna Design for Near-Field Fluid Antenna System </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Chen%2C+Y">Yixuan Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+M">Mingzhe Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Xu%2C+H">Hao Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Yang%2C+Z">Zhaohui Yang</a>, <a href="/search/eess?searchtype=author&amp;query=Wong%2C+K">Kai-Kit Wong</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+Z">Zhaoyang Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.05791v1-abstract-short" style="display: inline;"> In this letter, we study the energy efficiency maximization problem for a fluid antenna system (FAS) in near field communications. Specifically, we consider a point-to-point near-field system where the base station (BS) transmitter has multiple fixed-position antennas and the user receives the signals with multiple fluid antennas. Our objective is to jointly optimize the transmit beamforming of th&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.05791v1-abstract-full').style.display = 'inline'; document.getElementById('2407.05791v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.05791v1-abstract-full" style="display: none;"> In this letter, we study the energy efficiency maximization problem for a fluid antenna system (FAS) in near field communications. Specifically, we consider a point-to-point near-field system where the base station (BS) transmitter has multiple fixed-position antennas and the user receives the signals with multiple fluid antennas. Our objective is to jointly optimize the transmit beamforming of the BS and the fluid antenna positions at the user for maximizing the energy efficiency. Our scheme is based on an alternating optimization algorithm that iteratively solves the beamforming and antenna position subproblems. Our simulation results validate the performance improvement of the proposed algorithm and confirm the effectiveness of FAS. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.05791v1-abstract-full').style.display = 'none'; document.getElementById('2407.05791v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.04675">arXiv:2407.04675</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.04675">pdf</a>, <a href="https://arxiv.org/format/2407.04675">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Seed-ASR: Understanding Diverse Speech and Contexts with LLM-based Speech Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Bai%2C+Y">Ye Bai</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+J">Jingping Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+J">Jitong Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+W">Wei Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+Z">Zhuo Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Ding%2C+C">Chuang Ding</a>, <a href="/search/eess?searchtype=author&amp;query=Dong%2C+L">Linhao Dong</a>, <a href="/search/eess?searchtype=author&amp;query=Dong%2C+Q">Qianqian Dong</a>, <a href="/search/eess?searchtype=author&amp;query=Du%2C+Y">Yujiao Du</a>, <a href="/search/eess?searchtype=author&amp;query=Gao%2C+K">Kepan Gao</a>, <a href="/search/eess?searchtype=author&amp;query=Gao%2C+L">Lu Gao</a>, <a href="/search/eess?searchtype=author&amp;query=Guo%2C+Y">Yi Guo</a>, <a href="/search/eess?searchtype=author&amp;query=Han%2C+M">Minglun Han</a>, <a href="/search/eess?searchtype=author&amp;query=Han%2C+T">Ting Han</a>, <a href="/search/eess?searchtype=author&amp;query=Hu%2C+W">Wenchao Hu</a>, <a href="/search/eess?searchtype=author&amp;query=Hu%2C+X">Xinying Hu</a>, <a href="/search/eess?searchtype=author&amp;query=Hu%2C+Y">Yuxiang Hu</a>, <a href="/search/eess?searchtype=author&amp;query=Hua%2C+D">Deyu Hua</a>, <a href="/search/eess?searchtype=author&amp;query=Huang%2C+L">Lu Huang</a>, <a href="/search/eess?searchtype=author&amp;query=Huang%2C+M">Mingkun Huang</a>, <a href="/search/eess?searchtype=author&amp;query=Huang%2C+Y">Youjia Huang</a>, <a href="/search/eess?searchtype=author&amp;query=Jin%2C+J">Jishuo Jin</a>, <a href="/search/eess?searchtype=author&amp;query=Kong%2C+F">Fanliu Kong</a>, <a href="/search/eess?searchtype=author&amp;query=Lan%2C+Z">Zongwei Lan</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+T">Tianyu Li</a> , et al. 
(30 additional authors not shown)
Abstract: Modern automatic speech recognition (ASR) models are required to accurately transcribe diverse speech signals (from different domains, languages, accents, etc.) given specific contextual information in various application scenarios. Classic end-to-end models fused with extra language models perform well, but mainly in data-matching scenarios, and are gradually approaching a bottleneck. In this work, we introduce Seed-ASR, a large language model (LLM) based speech recognition model. Seed-ASR is developed based on the framework of audio-conditioned LLM (AcLLM), leveraging the capabilities of LLMs by inputting continuous speech representations together with contextual information into the LLM. Through stage-wise large-scale training and the elicitation of context-aware capabilities in the LLM, Seed-ASR demonstrates significant improvement over end-to-end models on comprehensive evaluation sets, including multiple domains, accents/dialects and languages. Additionally, Seed-ASR can be further deployed to support specific needs in various scenarios without requiring extra language models. Compared to recently released large ASR models, Seed-ASR achieves a 10%-40% reduction in word (or character, for Chinese) error rates on Chinese and English public test sets, further demonstrating its powerful performance.
Submitted 10 July, 2024; v1 submitted 5 July, 2024; originally announced July 2024.
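For reference, the word error rate cited above is the standard Levenshtein-based metric over words; a minimal, generic implementation:

def wer(ref: str, hyp: str) -> float:
    """WER = (substitutions + deletions + insertions) / reference length,
    computed with the classic edit-distance dynamic program over words."""
    r, h = ref.split(), hyp.split()
    d = [[0] * (len(h) + 1) for _ in range(len(r) + 1)]
    for i in range(len(r) + 1):
        d[i][0] = i
    for j in range(len(h) + 1):
        d[0][j] = j
    for i in range(1, len(r) + 1):
        for j in range(1, len(h) + 1):
            cost = 0 if r[i - 1] == h[j - 1] else 1
            d[i][j] = min(d[i - 1][j] + 1,         # deletion
                          d[i][j - 1] + 1,         # insertion
                          d[i - 1][j - 1] + cost)  # substitution or match
    return d[len(r)][len(h)] / max(len(r), 1)

print(wer("the cat sat", "the cat sat down"))  # one insertion over three words: 0.333...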
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.04368">arXiv:2407.04368</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.04368">pdf</a>, <a href="https://arxiv.org/format/2407.04368">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Romanization Encoding For Multilingual ASR </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Ding%2C+W">Wen Ding</a>, <a href="/search/eess?searchtype=author&amp;query=Jia%2C+F">Fei Jia</a>, <a href="/search/eess?searchtype=author&amp;query=Xu%2C+H">Hainan Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Xi%2C+Y">Yu Xi</a>, <a href="/search/eess?searchtype=author&amp;query=Lai%2C+J">Junjie Lai</a>, <a href="/search/eess?searchtype=author&amp;query=Ginsburg%2C+B">Boris Ginsburg</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.04368v1-abstract-short" style="display: inline;"> We introduce romanization encoding for script-heavy languages to optimize multilingual and code-switching Automatic Speech Recognition (ASR) systems. By adopting romanization encoding alongside a balanced concatenated tokenizer within a FastConformer-RNNT framework equipped with a Roman2Char module, we significantly reduce vocabulary and output dimensions, enabling larger training batches and redu&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.04368v1-abstract-full').style.display = 'inline'; document.getElementById('2407.04368v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.04368v1-abstract-full" style="display: none;"> We introduce romanization encoding for script-heavy languages to optimize multilingual and code-switching Automatic Speech Recognition (ASR) systems. By adopting romanization encoding alongside a balanced concatenated tokenizer within a FastConformer-RNNT framework equipped with a Roman2Char module, we significantly reduce vocabulary and output dimensions, enabling larger training batches and reduced memory consumption. Our method decouples acoustic modeling and language modeling, enhancing the flexibility and adaptability of the system. In our study, applying this method to Mandarin-English ASR resulted in a remarkable 63.51% vocabulary reduction and notable performance gains of 13.72% and 15.03% on SEAME code-switching benchmarks. Ablation studies on Mandarin-Korean and Mandarin-Japanese highlight our method&#39;s strong capability to address the complexities of other script-heavy languages, paving the way for more versatile and effective multilingual ASR systems. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.04368v1-abstract-full').style.display = 'none'; document.getElementById('2407.04368v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.04331">arXiv:2407.04331</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.04331">pdf</a>, <a href="https://arxiv.org/format/2407.04331">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> MuseBarControl: Enhancing Fine-Grained Control in Symbolic Music Generation through Pre-Training and Counterfactual Loss </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Shu%2C+Y">Yangyang Shu</a>, <a href="/search/eess?searchtype=author&amp;query=Xu%2C+H">Haiming Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Zhou%2C+Z">Ziqin Zhou</a>, <a href="/search/eess?searchtype=author&amp;query=Hengel%2C+A+v+d">Anton van den Hengel</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+L">Lingqiao Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.04331v1-abstract-short" style="display: inline;"> Automatically generating symbolic music-music scores tailored to specific human needs-can be highly beneficial for musicians and enthusiasts. Recent studies have shown promising results using extensive datasets and advanced transformer architectures. However, these state-of-the-art models generally offer only basic control over aspects like tempo and style for the entire composition, lacking the a&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.04331v1-abstract-full').style.display = 'inline'; document.getElementById('2407.04331v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.04331v1-abstract-full" style="display: none;"> Automatically generating symbolic music-music scores tailored to specific human needs-can be highly beneficial for musicians and enthusiasts. Recent studies have shown promising results using extensive datasets and advanced transformer architectures. However, these state-of-the-art models generally offer only basic control over aspects like tempo and style for the entire composition, lacking the ability to manage finer details, such as control at the level of individual bars. While fine-tuning a pre-trained symbolic music generation model might seem like a straightforward method for achieving this finer control, our research indicates challenges in this approach. The model often fails to respond adequately to new, fine-grained bar-level control signals. To address this, we propose two innovative solutions. 
<p class="is-size-7"><span class="has-text-weight-semibold">Submitted</span> 5 July, 2024; originally announced July 2024.</p>
<p class="comments is-size-7"><span class="has-text-weight-semibold">Comments:</span> Demo is available at: https://ganperf.github.io/musebarcontrol.github.io/musebarcontrol/</p>
</li>
<li class="arxiv-result">
<p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.03449">arXiv:2407.03449</a> [<a href="https://arxiv.org/pdf/2407.03449">pdf</a>, <a href="https://arxiv.org/format/2407.03449">other</a>]</p>
<div class="tags is-inline-block"><span class="tag">eess.SP</span></div>
<p class="title is-5 mathjax">A Tutorial on Fluid Antenna System for 6G Networks: Encompassing Communication Theory, Optimization Methods and Hardware Designs</p>
<p class="authors"><span class="search-hit">Authors:</span> Wee Kiat New, Kai-Kit Wong, Hao Xu, Chao Wang, Farshad Rostami Ghadi, Jichen Zhang, Junhui Rao, Ross Murch, Pablo Ramírez-Espinosa, David Morales-Jimenez, Chan-Byoung Chae, Kin-Fai Tong</p>
<p class="abstract mathjax"><span class="has-text-weight-semibold">Abstract:</span> The advent of sixth-generation (6G) networks presents another round of revolution for the mobile communication landscape, promising an immersive experience, robust reliability, minimal latency, extreme connectivity, ubiquitous coverage, and capabilities beyond communication, including intelligence and sensing. To achieve these ambitious goals, it is apparent that 6G networks need to incorporate state-of-the-art technologies. One technology that has garnered rising interest is the fluid antenna system (FAS), which represents any software-controllable fluidic, conductive, or dielectric structure capable of dynamically changing its shape and position to reconfigure essential radio-frequency (RF) characteristics. Compared to traditional antenna systems (TASs) with fixed-position radiating elements, the core idea of FAS revolves around the unique flexibility of reconfiguring the radiating elements within a given space. One recent driver of FAS is the recognition of its position flexibility as a new degree of freedom (dof) to harness diversity and multiplexing gains. In this paper, we provide a comprehensive tutorial, covering channel modeling, signal processing and estimation methods, information-theoretic insights, new multiple access techniques, and hardware designs. Moreover, we delineate the challenges of FAS and explore the potential of using FAS to improve the performance of other contemporary technologies. By providing insights and guidance, this tutorial paper serves to inspire researchers to explore new horizons and fully unleash the potential of FAS.</p>
<p class="is-size-7"><span class="has-text-weight-semibold">Submitted</span> 13 November, 2024; v1 submitted 3 July, 2024; originally announced July 2024.</p>
<p class="comments is-size-7"><span class="has-text-weight-semibold">Comments:</span> 53 pages, 45 figures, 5 tables. Accepted by IEEE Communications Surveys and Tutorials</p>
</li>
<li class="arxiv-result">
<p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.02400">arXiv:2407.02400</a> [<a href="https://arxiv.org/pdf/2407.02400">pdf</a>, <a href="https://arxiv.org/ps/2407.02400">ps</a>, <a href="https://arxiv.org/format/2407.02400">other</a>]</p>
<div class="tags is-inline-block"><span class="tag">cs.IT</span> <span class="tag">eess.SP</span> <span class="tag">doi: <a href="https://doi.org/10.1109/LCOMM.2024.3418338">10.1109/LCOMM.2024.3418338</a></span></div>
<p class="title is-5 mathjax">Coding-Enhanced Cooperative Jamming for Secret Communication in Fluid Antenna Systems</p>
<p class="authors"><span class="search-hit">Authors:</span> Hao Xu, Kai-Kit Wong, Wee Kiat New, Guyue Li, Farshad Rostami Ghadi, Yongxu Zhu, Shi Jin, Chan-Byoung Chae, Yangyang Zhang</p>
<p class="abstract mathjax"><span class="has-text-weight-semibold">Abstract:</span> This letter investigates the secret communication problem for a fluid antenna system (FAS)-assisted wiretap channel, where the legitimate transmitter transmits an information-bearing signal to the legitimate receiver and, at the same time, transmits a jamming signal to interfere with the eavesdropper (Eve). Unlike the conventional jamming scheme, which usually transmits Gaussian noise that interferes not only with Eve but also with the legitimate receiver, in this letter we consider that encoded codewords are transmitted to jam Eve.
Then, by employing appropriate coding schemes, the legitimate receiver can successfully decode the jamming signal and cancel the interference, while Eve cannot, even if it knows the codebooks. We aim to maximize the secrecy rate through port selection and power control. Although the problem is non-convex, we show that the optimal solution can be found. Simulation results show that by using the FAS technique and the proposed jamming scheme, the secrecy rate of the system can be significantly increased.</p>
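<p>A small numeric sketch of the optimization loop described above (the channel gains, noise level, and exhaustive grid search are illustrative assumptions; the paper derives the optimal solution analytically): the legitimate receiver cancels the decoded jamming signal, so jamming power degrades only Eve's link, and each fluid-antenna port offers a different channel realization.</p>
<pre><code>
# Toy sketch of secrecy-rate maximization via port selection and power
# control; channel gains and the exhaustive grid are illustrative only.
import math, random

random.seed(0)
P_TOTAL = 1.0            # total transmit power budget
NOISE = 0.1
PORTS = 8                # candidate fluid-antenna ports

# Per-port channel power gains to Bob and Eve (stand-ins for fading draws).
g_bob = [random.uniform(0.2, 1.5) for _ in range(PORTS)]
g_eve = [random.uniform(0.2, 1.5) for _ in range(PORTS)]

best = (-1.0, None, None)
for port in range(PORTS):
    for frac in [i / 100 for i in range(1, 100)]:   # fraction of power on the message
        p_msg, p_jam = frac * P_TOTAL, (1 - frac) * P_TOTAL
        # Bob decodes the coded jamming signal first and cancels it, so only
        # noise remains; Eve cannot cancel it even knowing the codebooks.
        snr_bob = p_msg * g_bob[port] / NOISE
        sinr_eve = p_msg * g_eve[port] / (p_jam * g_eve[port] + NOISE)
        secrecy = max(0.0, math.log2(1 + snr_bob) - math.log2(1 + sinr_eve))
        if secrecy > best[0]:
            best = (secrecy, port, frac)

rate, port, frac = best
print(f"port={port}, message power fraction={frac:.2f}, secrecy rate={rate:.3f} bit/s/Hz")
</code></pre>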
<p class="is-size-7"><span class="has-text-weight-semibold">Submitted</span> 2 July, 2024; originally announced July 2024.</p>
<p class="comments is-size-7"><span class="has-text-weight-semibold">Comments:</span> 6 pages, 3 figures; this paper has been accepted by IEEE Communications Letters</p>
</li>
<li class="arxiv-result">
<p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.13413">arXiv:2406.13413</a> [<a href="https://arxiv.org/pdf/2406.13413">pdf</a>, <a href="https://arxiv.org/format/2406.13413">other</a>]</p>
<div class="tags is-inline-block"><span class="tag">eess.IV</span> <span class="tag">cs.CV</span></div>
<p class="title is-5 mathjax">Recurrent Inference Machine for Medical Image Registration</p>
<p class="authors"><span class="search-hit">Authors:</span> Yi Zhang, Yidong Zhao, Hui Xue, Peter Kellman, Stefan Klein, Qian Tao</p>
<p class="abstract mathjax"><span class="has-text-weight-semibold">Abstract:</span> Image registration is essential for medical image applications where alignment of voxels across multiple images is needed for qualitative or quantitative analysis. With recent advances in deep neural networks and parallel computing, deep learning-based medical image registration methods have become competitive, offering flexible modelling and fast inference. However, compared to traditional optimization-based registration methods, the speed advantage may come at the cost of registration performance at inference time. Besides, deep neural networks ideally demand large training datasets, while optimization-based methods are training-free. To improve registration accuracy and data efficiency, we propose a novel image registration method, termed the Recurrent Inference Image Registration (RIIR) network. RIIR is formulated as a meta-learning solver of the registration problem in an iterative manner. RIIR addresses the accuracy and data efficiency issues by learning the update rule of optimization, with implicit regularization combined with explicit gradient input. We evaluated RIIR extensively on brain MRI and quantitative cardiac MRI datasets, in terms of both registration accuracy and training data efficiency. Our experiments showed that RIIR outperformed a range of deep learning-based methods, even with only 5% of the training data, demonstrating high data efficiency. Key findings from our ablation studies highlighted the important added value of the hidden states introduced in the recurrent inference framework for meta-learning. Our proposed RIIR offers a highly data-efficient framework for deep learning-based medical image registration.</p>
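<p>A minimal sketch of the recurrent-inference idea (learning the update rule of an iterative registration solver with explicit gradient input and a hidden state); the GRU cell, network sizes, and the toy 1-D translation problem are assumptions, not the RIIR architecture:</p>
<pre><code>
# Sketch of a recurrent inference machine: a learned optimizer proposes
# deformation updates from the current gradient and a hidden state.
# Toy 1-D translation registration; all sizes are illustrative.
import torch
import torch.nn as nn

class RIMStep(nn.Module):
    def __init__(self, hidden=32):
        super().__init__()
        self.cell = nn.GRUCell(2, hidden)       # input: [parameter, gradient]
        self.delta = nn.Linear(hidden, 1)       # proposed parameter update

    def forward(self, theta, grad, h):
        h = self.cell(torch.cat([theta, grad], dim=-1), h)
        return theta + self.delta(h), h         # learned update rule

def similarity_loss(theta, fixed, moving_fn):
    return ((moving_fn(theta) - fixed) ** 2).mean()

moving_fn = lambda t: torch.sin(torch.linspace(0, 6, 64) + t)  # "moving image"
fixed = torch.sin(torch.linspace(0, 6, 64) + 0.7)              # target shift 0.7

step = RIMStep()
theta = torch.zeros(1, 1, requires_grad=True)
h = torch.zeros(1, 32)
total = 0.0
for _ in range(4):                              # a few inference iterations
    loss = similarity_loss(theta, fixed, moving_fn)
    (grad,) = torch.autograd.grad(loss, theta, create_graph=True)
    theta, h = step(theta, grad, h)             # explicit gradient input + hidden state
    total = total + similarity_loss(theta, fixed, moving_fn)
total.backward()                                # meta-train the update rule itself
</code></pre>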
<p class="is-size-7"><span class="has-text-weight-semibold">Submitted</span> 19 June, 2024; originally announced June 2024.</p>
<p class="comments is-size-7"><span class="has-text-weight-semibold">Comments:</span> Preprint</p>
</li>
<li class="arxiv-result">
<p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.11653">arXiv:2406.11653</a> [<a href="https://arxiv.org/pdf/2406.11653">pdf</a>, <a href="https://arxiv.org/format/2406.11653">other</a>]</p>
<div class="tags is-inline-block"><span class="tag">eess.SY</span></div>
<p class="title is-5 mathjax">Communication-Efficient MARL for Platoon Stability and Energy-efficiency Co-optimization in Cooperative Adaptive Cruise Control of CAVs</p>
<p class="authors"><span class="search-hit">Authors:</span> Min Hua, Dong Chen, Kun Jiang, Fanggang Zhang, Jinhai Wang, Bo Wang, Quan Zhou, Hongming Xu</p>
<p class="abstract mathjax"><span class="has-text-weight-semibold">Abstract:</span> Cooperative adaptive cruise control (CACC) has been recognized as a fundamental function of autonomous driving, in which platoon stability and energy efficiency are outstanding challenges that are difficult to accommodate in real-world operations. This paper studies the CACC of connected and autonomous vehicles (CAVs) based on multi-agent reinforcement learning (MARL) to optimize platoon stability and energy efficiency simultaneously. The optimal use of communication bandwidth is the key to guaranteeing learning performance in real-world driving, and thus this paper proposes a communication-efficient MARL by incorporating quantized stochastic gradient descent (QSGD) and a binary differential consensus (BDC) method into a fully decentralized MARL framework. We benchmarked the performance of our proposed BDC-MARL algorithm against several non-communicative and communicative MARL algorithms, e.g., IA2C, FPrint, and DIAL, through the evaluation of platoon stability, fuel economy, and driving comfort.
Our results show that BDC-MARL achieved the highest energy savings, improving by up to 5.8%, with an average velocity of 15.26 m/s and an inter-vehicle spacing of 20.76 m. In addition, we conducted different information-sharing analyses to assess communication efficacy, along with sensitivity analyses and scalability tests with varying platoon sizes. The practical effectiveness of our approach is further demonstrated using real-world scenarios sourced from the open-source OpenACC dataset.</p>
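<p>A hedged sketch of the bandwidth-saving ingredient named above; the exact QSGD/BDC formulations in the paper differ, and this one-bit scheme is only indicative of the idea of exchanging signs of differences rather than full-precision vectors:</p>
<pre><code>
# Toy sketch of binary differential consensus (BDC): agents exchange only
# the SIGNS of parameter differences, plus a scale, instead of full vectors.
# This illustrates the idea; it is not the paper's exact algorithm.
import numpy as np

rng = np.random.default_rng(0)
N_AGENTS, DIM, STEP = 4, 6, 0.3
params = rng.normal(size=(N_AGENTS, DIM))      # each agent's local policy parameters

def bdc_round(params):
    mean = params.mean(axis=0)                 # conceptually obtained via gossip
    diff = mean - params
    scale = np.abs(diff).mean(axis=1, keepdims=True)  # one scalar per agent
    bits = np.sign(diff)                       # 1 bit per parameter on the wire
    return params + STEP * scale * bits        # quantized consensus update

for t in range(50):
    params = bdc_round(params)

print("spread after consensus:", np.ptp(params, axis=0).max())  # agents nearly agree
</code></pre>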
<p class="is-size-7"><span class="has-text-weight-semibold">Submitted</span> 17 June, 2024; originally announced June 2024.</p>
</li>
<li class="arxiv-result">
<p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.11446">arXiv:2406.11446</a> [<a href="https://arxiv.org/pdf/2406.11446">pdf</a>, <a href="https://arxiv.org/format/2406.11446">other</a>]</p>
<div class="tags is-inline-block"><span class="tag">eess.SP</span></div>
<p class="title is-5 mathjax">An Approximate Wave-Number Domain Expression for Near-Field XL-MIMO Channel</p>
<p class="authors"><span class="search-hit">Authors:</span> Hongbo Xing, Yuxiang Zhang, Jianhua Zhang, Huixin Xu, Guangyi Liu, Qixing Wang</p>
<p class="abstract mathjax"><span class="has-text-weight-semibold">Abstract:</span> As Extremely Large-Scale Multiple-Input-Multiple-Output (XL-MIMO) technology advances and carrier frequencies rise, the near-field effects in communication are intensifying. A concise and accurate near-field XL-MIMO channel model serves as the cornerstone for investigating the near-field effects. However, existing wave-number domain XL-MIMO channel models under near-field conditions require non-closed-form oscillatory integrals for computation, making it difficult to analyze the channel characteristics in closed form. To obtain a more succinct channel model, this paper introduces a closed-form approximate expression based on the principle of stationary phase. It is then shown that when the scatterer distance is larger than the array aperture, the closed-form model can be further simplified to a trapezoidal spectrum. We validate the accuracy of the proposed approximation through simulations of power angular spectrum similarity. The results indicate that the proposed approximation can accurately approximate the near-field wave-number domain channel within the effective Rayleigh distance.</p>
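<p>For reference, the generic stationary-phase approximation that underlies such closed forms is the textbook identity (the paper's channel-specific expression is more detailed): an oscillatory integral is replaced by an evaluation at the stationary point $k_0$ of the phase,</p>
<p class="mathjax">$$\int_{-\infty}^{\infty} g(k)\,e^{j\phi(k)}\,dk \;\approx\; g(k_0)\,e^{j\phi(k_0)}\,\sqrt{\frac{2\pi}{\left|\phi''(k_0)\right|}}\;e^{\,j\,\mathrm{sgn}\!\left(\phi''(k_0)\right)\pi/4}, \qquad \phi'(k_0)=0,$$</p>
<p>which is accurate when the phase $\phi(k)$ varies much faster than the envelope $g(k)$, the regime created by large apertures and high carrier frequencies.</p>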
<p class="is-size-7"><span class="has-text-weight-semibold">Submitted</span> 16 July, 2024; v1 submitted 17 June, 2024; originally announced June 2024.</p>
</li>
<li class="arxiv-result">
<p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.11265">arXiv:2406.11265</a> [<a href="https://arxiv.org/pdf/2406.11265">pdf</a>, <a href="https://arxiv.org/ps/2406.11265">ps</a>, <a href="https://arxiv.org/format/2406.11265">other</a>]</p>
<div class="tags is-inline-block"><span class="tag">eess.SY</span></div>
<p class="title is-5 mathjax">Balancing Performance and Cost for Two-Hop Cooperative Communications: Stackelberg Game and Distributed Multi-Agent Reinforcement Learning</p>
<p class="authors"><span class="search-hit">Authors:</span> Yuanzhe Geng, Erwu Liu, Wei Ni, Rui Wang, Yan Liu, Hao Xu, Chen Cai, Abbas Jamalipour</p>
<p class="abstract mathjax"><span class="has-text-weight-semibold">Abstract:</span> This paper aims to balance performance and cost in a two-hop wireless cooperative communication network where the source and relays have contradictory optimization goals and make decisions in a distributed manner. This differs from most existing works, which typically assume that source and relay nodes follow a schedule created implicitly by a central controller. We propose that the relays form an alliance in an attempt to maximize the benefit of relaying, while the source aims to increase the channel capacity cost-effectively. To this end, we formulate the trade problem as a Stackelberg game and prove the existence of its equilibrium. Another important aspect is that we use multi-agent reinforcement learning (MARL) to approach the equilibrium in a situation where the instantaneous channel state information (CSI) is unavailable and the source and relays do not have knowledge of each other's goal. A multi-agent deep deterministic policy gradient-based framework is designed, where the relay alliance and the source act as agents. Experiments demonstrate that the proposed method can obtain an acceptable performance that is close to the game-theoretic equilibrium for all players under time-invariant environments, considerably outperforming its potential alternatives and lying only about 2.9% away from the optimal solution.</p>
<p class="is-size-7"><span class="has-text-weight-semibold">Submitted</span> 17 June, 2024; originally announced June 2024.</p>
</li>
<li class="arxiv-result">
<p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.10160">arXiv:2406.10160</a> [<a href="https://arxiv.org/pdf/2406.10160">pdf</a>, <a href="https://arxiv.org/format/2406.10160">other</a>]</p>
<div class="tags is-inline-block"><span class="tag">cs.SD</span> <span class="tag">cs.AI</span> <span class="tag">eess.AS</span></div>
<p class="title is-5 mathjax">One-pass Multiple Conformer and Foundation Speech Systems Compression and Quantization Using An All-in-one Neural Model</p>
<p class="authors"><span class="search-hit">Authors:</span> Zhaoqing Li, Haoning Xu, Tianzi Wang, Shoukang Hu, Zengrui Jin, Shujie Hu, Jiajun Deng, Mingyu Cui, Mengzhe Geng, Xunying Liu</p>
<p class="abstract mathjax"><span class="has-text-weight-semibold">Abstract:</span> We propose a novel one-pass joint compression and quantization approach for multiple ASR systems using an all-in-one neural model. A single compression cycle allows multiple nested systems with varying Encoder depths, widths, and quantization precision settings to be simultaneously constructed without the need to train and store individual target systems separately. Experiments consistently demonstrate that the multiple ASR systems compressed in a single all-in-one model produce a word error rate (WER) comparable to, or lower by up to 1.01% absolute (6.98% relative) than, individually trained systems of equal complexity. A 3.4x overall system compression and training time speed-up was achieved.
Maximum model size compression ratios of 12.8x and 3.93x were obtained over the baseline Switchboard-300hr Conformer and LibriSpeech-100hr fine-tuned wav2vec2.0 models, respectively, incurring no statistically significant WER increase.</p>
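<p>A skeletal sketch of the all-in-one idea as we read it (sizes, the crude quantizer, and the weight-sharing scheme are invented for illustration): one weight-shared encoder from which nested sub-encoders of different depth, width, and bit-width are trained jointly, so a single cycle yields many deployable systems.</p>
<pre><code>
# Sketch of one-pass, all-in-one compression: a weight-shared encoder whose
# nested sub-configurations (depth, width, bit-width) are trained jointly.
# Sizes and the crude uniform quantizer are illustrative assumptions.
import torch
import torch.nn as nn

MAX_LAYERS, MAX_WIDTH = 4, 256

def fake_quant(x, bits):
    # crude uniform quantizer stand-in (real systems use STE or learned schemes)
    scale = x.abs().max().clamp(min=1e-8) / (2 ** (bits - 1) - 1)
    return torch.round(x / scale) * scale

class AllInOneEncoder(nn.Module):
    def __init__(self):
        super().__init__()
        self.layers = nn.ModuleList(nn.Linear(MAX_WIDTH, MAX_WIDTH)
                                    for _ in range(MAX_LAYERS))

    def forward(self, x, depth, width, bits):
        for layer in self.layers[:depth]:                       # nested depth
            w = fake_quant(layer.weight[:width, :width], bits)  # nested width + precision
            b = layer.bias[:width]
            x = torch.relu(x[..., :width] @ w.T + b)
        return x

enc = AllInOneEncoder()
x = torch.randn(8, MAX_WIDTH)
# One training step supervises several nested systems simultaneously.
loss = sum(enc(x, d, w, b).pow(2).mean()
           for d, w, b in [(4, 256, 8), (2, 128, 8), (2, 128, 4)])
loss.backward()
</code></pre>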
<p class="is-size-7"><span class="has-text-weight-semibold">Submitted</span> 14 June, 2024; originally announced June 2024.</p>
<p class="comments is-size-7"><span class="has-text-weight-semibold">Comments:</span> Accepted by Interspeech 2024</p>
</li>
<li class="arxiv-result">
<p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.09656">arXiv:2406.09656</a> [<a href="https://arxiv.org/pdf/2406.09656">pdf</a>, <a href="https://arxiv.org/format/2406.09656">other</a>]</p>
<div class="tags is-inline-block"><span class="tag">cs.CV</span> <span class="tag">cs.AI</span> <span class="tag">cs.LG</span> <span class="tag">eess.IV</span></div>
<p class="title is-5 mathjax">RSEND: Retinex-based Squeeze and Excitation Network with Dark Region Detection for Efficient Low Light Image Enhancement</p>
<p class="authors"><span class="search-hit">Authors:</span> Jingcheng Li, Ye Qiao, Haocheng Xu, Sitao Huang</p>
<p class="abstract mathjax"><span class="has-text-weight-semibold">Abstract:</span> Images captured under low-light scenarios often suffer from low quality. Previous CNN-based deep learning methods often involve Retinex theory. Nevertheless, most of them cannot perform well on more complicated datasets like LOL-v2 while consuming too many computational resources. Besides, some of these methods require sophisticated training at different stages, making the procedure even more time-consuming and tedious. In this paper, we propose a more accurate, concise, and one-stage Retinex-theory-based framework, RSEND. RSEND first divides the low-light image into an illumination map and a reflectance map, then captures the important details in the illumination map and performs light enhancement. After this step, it refines the enhanced gray-scale image and performs element-wise matrix multiplication with the reflectance map. Denoising the output of the previous step yields the final result. In all steps, RSEND utilizes a Squeeze-and-Excitation network to better capture details. Comprehensive quantitative and qualitative experiments show that our efficient Retinex model significantly outperforms other CNN-based models, achieving a PSNR improvement ranging from 0.44 dB to 4.2 dB on different datasets, and even outperforms transformer-based models on the LOL-v2-real dataset.</p>
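<p>A compact sketch of the staged pipeline described above, with a minimal Squeeze-and-Excitation block; the layer shapes are invented and the real RSEND blocks are substantially richer:</p>
<pre><code>
# Minimal sketch of a Retinex-style pipeline with Squeeze-and-Excitation
# (SE) channel attention; illustrative only, not the RSEND architecture.
import torch
import torch.nn as nn

class SEBlock(nn.Module):
    def __init__(self, ch, r=4):
        super().__init__()
        self.fc = nn.Sequential(nn.Linear(ch, ch // r), nn.ReLU(),
                                nn.Linear(ch // r, ch), nn.Sigmoid())
    def forward(self, x):                           # x: (B, C, H, W)
        w = self.fc(x.mean(dim=(2, 3)))             # squeeze: global average pool
        return x * w[:, :, None, None]              # excite: reweight channels

class RetinexSketch(nn.Module):
    def __init__(self):
        super().__init__()
        self.decompose = nn.Conv2d(3, 4, 3, padding=1)   # reflectance (3) + illumination (1)
        self.enhance = nn.Sequential(nn.Conv2d(1, 8, 3, padding=1), SEBlock(8),
                                     nn.Conv2d(8, 1, 3, padding=1), nn.Sigmoid())
        self.denoise = nn.Conv2d(3, 3, 3, padding=1)

    def forward(self, low):
        maps = self.decompose(low)
        reflectance, illumination = maps[:, :3], maps[:, 3:]
        illumination = self.enhance(illumination)        # light up the illumination map
        out = reflectance * illumination                 # element-wise recombination
        return self.denoise(out)                         # final cleanup stage

img = torch.rand(1, 3, 64, 64)
print(RetinexSketch()(img).shape)  # torch.Size([1, 3, 64, 64])
</code></pre>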
<p class="is-size-7"><span class="has-text-weight-semibold">Submitted</span> 13 June, 2024; originally announced June 2024.</p>
</li>
<li class="arxiv-result">
<p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.07256">arXiv:2406.07256</a> [<a href="https://arxiv.org/pdf/2406.07256">pdf</a>, <a href="https://arxiv.org/ps/2406.07256">ps</a>, <a href="https://arxiv.org/format/2406.07256">other</a>]</p>
<div class="tags is-inline-block"><span class="tag">cs.SD</span> <span class="tag">cs.AI</span> <span class="tag">eess.AS</span></div>
<p class="title is-5 mathjax">AS-70: A Mandarin stuttered speech dataset for automatic speech recognition and stuttering event detection</p>
<p class="authors"><span class="search-hit">Authors:</span> Rong Gong, Hongfei Xue, Lezhi Wang, Xin Xu, Qisheng Li, Lei Xie, Hui Bu, Shaomei Wu, Jiaming Zhou, Yong Qin, Binbin Zhang, Jun Du, Jia Bin, Ming Li</p>
<p class="abstract mathjax"><span class="has-text-weight-semibold">Abstract:</span> The rapid advancements in speech technologies over the past two decades have led to human-level performance in tasks like automatic speech recognition (ASR) for fluent speech. However, the efficacy of these models diminishes when applied to atypical speech, such as stuttering. This paper introduces AS-70, the first publicly available Mandarin stuttered speech dataset, which stands out as the largest dataset in its category. Encompassing conversational and voice-command reading speech, AS-70 includes verbatim manual transcription, rendering it suitable for various speech-related tasks. Furthermore, baseline systems are established, and experimental results are presented for ASR and stuttering event detection (SED) tasks. By incorporating this dataset into model fine-tuning, significant improvements in state-of-the-art ASR models, e.g., Whisper and HuBERT, are observed, enhancing their inclusivity in addressing stuttered speech.</p>
<p class="is-size-7"><span class="has-text-weight-semibold">Submitted</span> 11 June, 2024; originally announced June 2024.</p>
<p class="comments is-size-7"><span class="has-text-weight-semibold">Comments:</span> Accepted by Interspeech 2024</p>
</li>
<li class="arxiv-result">
<p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.06643">arXiv:2406.06643</a> [<a href="https://arxiv.org/pdf/2406.06643">pdf</a>]</p>
<div class="tags is-inline-block"><span class="tag">eess.IV</span></div>
<p class="title is-5 mathjax">Transforming Heart Chamber Imaging: Self-Supervised Learning for Whole Heart Reconstruction and Segmentation</p>
<p class="authors"><span class="search-hit">Authors:</span> Abdul Qayyum, Hao Xu, Brian P. Halliday, Cristobal Rodero, Christopher W. Lanyon, Richard D. Wilkinson, Steven Alexander Niederer</p>
<p class="abstract mathjax"><span class="has-text-weight-semibold">Abstract:</span> Automated segmentation of Cardiac Magnetic Resonance (CMR) plays a pivotal role in efficiently assessing cardiac function, offering rapid clinical evaluations that benefit both healthcare practitioners and patients. While recent research has primarily focused on delineating structures in the short-axis orientation, less attention has been given to long-axis representations, mainly due to the complex nature of structures in this orientation. Performing pixel-wise segmentation of the left ventricular (LV) myocardium and the four cardiac chambers in 2-D steady-state free precession (SSFP) cine sequences is a crucial preprocessing stage for various analyses. However, the challenge lies in the significant variability in contrast, appearance, orientation, and positioning of the heart across different patients, clinical views, scanners, and imaging protocols. Consequently, achieving fully automatic semantic segmentation in this context is notoriously challenging. In recent years, several deep learning models have been proposed to accurately quantify and diagnose cardiac pathologies. These automated tools heavily rely on the accurate segmentation of cardiac structures in magnetic resonance images (MRI). Hence, there is a need for new methods to handle the geometrical and textural complexities of such structures. We propose two-stage self-supervised 2D and 3D segmentation architectures that hybridize transformer and CNN components for four-chamber (4CH) whole-heart segmentation. Accurate segmentation of the ventricles and atria in 4CH views is crucial for analyzing heart health and reconstructing four-chamber meshes, which are essential for estimating various parameters to assess overall heart condition. Our proposed method outperformed state-of-the-art techniques, demonstrating superior performance in this domain.</p>
<p class="is-size-7"><span class="has-text-weight-semibold">Submitted</span> 9 June, 2024; originally announced June 2024.</p>
<p class="comments is-size-7"><span class="has-text-weight-semibold">Comments:</span> arXiv admin note: text overlap with arXiv:2206.07349 by other authors</p>
</li>
<li class="arxiv-result">
<p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.06220">arXiv:2406.06220</a> [<a href="https://arxiv.org/pdf/2406.06220">pdf</a>, <a href="https://arxiv.org/format/2406.06220">other</a>]</p>
<div class="tags is-inline-block"><span class="tag">eess.AS</span> <span class="tag">cs.AI</span> <span class="tag">cs.CL</span> <span class="tag">cs.LG</span> <span class="tag">cs.SD</span></div>
<p class="title is-5 mathjax">Label-Looping: Highly Efficient Decoding for Transducers</p>
<p class="authors"><span class="search-hit">Authors:</span> Vladimir Bataev, Hainan Xu, Daniel Galvez, Vitaly Lavrukhin, Boris Ginsburg</p>
<p class="abstract mathjax"><span class="has-text-weight-semibold">Abstract:</span> This paper introduces a highly efficient greedy decoding algorithm for Transducer-based speech recognition models. We redesign the standard nested-loop design for RNN-T decoding, swapping the loops over frames and labels: the outer loop iterates over labels, while the inner loop iterates over frames searching for the next non-blank symbol. Additionally, we represent partial hypotheses in a special structure using CUDA tensors, supporting parallelized hypothesis manipulations. Experiments show that the label-looping algorithm is up to 2.0X faster than conventional batched decoding when using batch size 32. It can be further combined with other compiler- or GPU-call-related techniques to achieve even more speedup. Our algorithm is general-purpose and can work with both conventional Transducers and Token-and-Duration Transducers.
We open-source our implementation to benefit the research community.</p>
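<p>The loop inversion is concrete enough to paraphrase in scalar (batch-of-one) form; the stand-in <code>predict</code>/<code>joint</code> functions below are toys, not the open-source batched CUDA implementation, but the control flow mirrors the description above: the outer loop advances labels while the inner loop scans frames for the next non-blank symbol.</p>
<pre><code>
# Sketch of label-looping greedy decoding for a Transducer: the OUTER loop
# advances labels, the INNER loop scans frames for the next non-blank.
BLANK = 0

def label_looping_decode(frames, predict, joint, max_symbols=100):
    """Greedy Transducer decoding with swapped loops."""
    hyp = []
    state = predict(None, None)           # decoder state for the empty history
    t = 0
    while len(hyp) < max_symbols and t < len(frames):
        # Inner loop: scan frames under the CURRENT decoder state until
        # the argmax of the joint network is a non-blank label.
        while t < len(frames):
            scores = joint(frames[t], state)
            best = max(range(len(scores)), key=scores.__getitem__)
            if best != BLANK:
                break
            t += 1                        # blank: consume a frame, state unchanged
        else:
            break                         # ran out of frames
        hyp.append(best)                  # outer loop: emit one label...
        state = predict(state, best)      # ...and advance the decoder once
    return hyp

# Toy stand-ins: each "frame" prefers its own value unless it was just emitted.
def toy_predict(state, label):
    return label                          # state = last emitted label

def toy_joint(frame, state):
    scores = [0.0] * 4                    # vocab: {0: blank, 1, 2, 3}
    scores[frame] += 2.0
    if state == frame:
        scores[BLANK] = 5.0               # suppress immediate repeats -> blank
    return scores

print(label_looping_decode([1, 1, 2, 3, 3], toy_predict, toy_joint))  # [1, 2, 3]
</code></pre>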
arXiv:2405.17100 [pdf, other] (cs.CR, cs.SD, eess.AS)
SoK: Comprehensive Security Overview, Challenges, and Future Directions of Voice-Controlled Systems
Authors: Haozhe Xu, Cong Wu, Yangyang Gu, Xingcan Shang, Jing Chen, Kun He, Ruiying Du
Abstract: The integration of Voice Control Systems (VCS) into smart devices and their growing presence in daily life accentuate the importance of their security. Current research has uncovered numerous vulnerabilities in VCS, presenting significant risks to user privacy and security. However, a cohesive and systematic examination of these vulnerabilities and the corresponding solutions is still absent. This lack of comprehensive analysis presents a challenge for VCS designers in fully understanding and mitigating the security issues within these systems. Addressing this gap, our study introduces a hierarchical model structure for VCS, providing a novel lens for categorizing and analyzing the existing literature in a systematic manner. We classify attacks based on their technical principles and thoroughly evaluate various attributes, such as their methods, targets, vectors, and behaviors. Furthermore, we consolidate and assess the defense mechanisms proposed in current research, offering actionable recommendations for enhancing VCS security. Our work makes a significant contribution by simplifying the complexity inherent in VCS security, aiding designers in effectively identifying and countering potential threats, and setting a foundation for future advancements in VCS security research.
Submitted 27 May, 2024; originally announced May 2024.

arXiv:2405.15831 [pdf, other] (eess.SY, cs.AI, cs.LG)
Transmission Interface Power Flow Adjustment: A Deep Reinforcement Learning Approach based on Multi-task Attribution Map
Authors: Shunyu Liu, Wei Luo, Yanzhen Zhou, Kaixuan Chen, Quan Zhang, Huating Xu, Qinglai Guo, Mingli Song
Abstract: Transmission interface power flow adjustment is a critical measure to ensure the secure and economical operation of power systems.
However, conventional model-based adjustment schemes are limited by the increasing variations and uncertainties occurring in power systems, where the adjustment problems of different transmission interfaces are often treated as several independent tasks, ignoring their coupling relationships and even leading to conflicting decisions. In this paper, we introduce a novel data-driven deep reinforcement learning (DRL) approach to handle multiple power flow adjustment tasks jointly instead of learning each task from scratch. At the heart of the proposed method is a multi-task attribution map (MAM), which enables the DRL agent to explicitly attribute each transmission interface task to different power system nodes with task-adaptive attention weights. Based on this MAM, the agent can further provide effective strategies to solve the multi-task adjustment problem with a near-optimal operation cost. Simulation results on the IEEE 118-bus system, a realistic 300-bus system in China, and a very large European system with 9241 buses demonstrate that the proposed method significantly improves performance compared with several baseline methods, and exhibits high interpretability with the learnable MAM.
Submitted 24 May, 2024; originally announced May 2024.
Comments: Accepted by IEEE Transactions on Power Systems
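A hedged sketch of what a task-adaptive attention map over power-system nodes could look like; the module name, shapes, and bilinear scoring are our assumptions, not the paper's architecture:

import torch
import torch.nn as nn

class MultiTaskAttributionMap(nn.Module):
    def __init__(self, node_dim, task_dim, n_tasks):
        super().__init__()
        self.task_emb = nn.Embedding(n_tasks, task_dim)
        self.score = nn.Bilinear(node_dim, task_dim, 1)   # node-task affinity

    def forward(self, node_feats, task_id):
        # node_feats: (n_nodes, node_dim); task_id: scalar LongTensor
        task = self.task_emb(task_id).expand(node_feats.size(0), -1)
        attn = torch.softmax(self.score(node_feats, task).squeeze(-1), dim=0)
        return attn   # task-adaptive weights over nodes, summing to 1

# a policy head could then act on attn[:, None] * node_feats, so each
# adjustment task attends to its own subset of nodes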
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by IEEE Transactions on Power Systems</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.15607">arXiv:2405.15607</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2405.15607">pdf</a>, <a href="https://arxiv.org/format/2405.15607">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Channel Estimation and Reconstruction in Fluid Antenna System: Oversampling is Essential </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=New%2C+W+K">Wee Kiat New</a>, <a href="/search/eess?searchtype=author&amp;query=Wong%2C+K">Kai-Kit Wong</a>, <a href="/search/eess?searchtype=author&amp;query=Xu%2C+H">Hao Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Ghadi%2C+F+R">Farshad Rostami Ghadi</a>, <a href="/search/eess?searchtype=author&amp;query=Murch%2C+R">Ross Murch</a>, <a href="/search/eess?searchtype=author&amp;query=Chae%2C+C">Chan-Byoung Chae</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.15607v2-abstract-short" style="display: inline;"> Fluid antenna system (FAS) has recently surfaced as a promising technology for the upcoming sixth generation (6G) wireless networks. Unlike traditional antenna system (TAS) with fixed antenna location, FAS introduces a flexible component in which the radiating element can switch its position within a predefined space. This capability allows FAS to achieve additional diversity and multiplexing gain&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.15607v2-abstract-full').style.display = 'inline'; document.getElementById('2405.15607v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.15607v2-abstract-full" style="display: none;"> Fluid antenna system (FAS) has recently surfaced as a promising technology for the upcoming sixth generation (6G) wireless networks. Unlike traditional antenna system (TAS) with fixed antenna location, FAS introduces a flexible component in which the radiating element can switch its position within a predefined space. This capability allows FAS to achieve additional diversity and multiplexing gains. Nevertheless, to fully reap the benefits of FAS, obtaining channel state information (CSI) over the predefined space is crucial. In this paper, we study the system with a transmitter equipped with a traditional fixed antenna and a receiver with a fluid antenna by considering an electromagnetic-compliant channel model. We address the challenges of channel estimation and reconstruction using Nyquist sampling and maximum likelihood estimation (MLE) methods. Our analysis reveals a fundamental tradeoff between the accuracy of the reconstructed channel and the number of estimated channels, indicating that half-wavelength sampling is insufficient for perfect reconstruction and that oversampling is essential to enhance accuracy. Despite its advantages, oversampling can introduce practical challenges. 
Consequently, we propose a suboptimal sampling distance that facilitates efficient channel reconstruction. In addition, we employ the MLE method to bound the channel estimation error by $\varepsilon$ with a specific confidence interval (CI). Our findings enable us to determine the minimum number of estimated channels and the total number of pilot symbols required for efficient channel reconstruction in a given space. Lastly, we investigate the rate performance of FAS and TAS and demonstrate that FAS with imperfect CSI can outperform TAS with perfect CSI. In contrast to existing works, we also show that there is an optimal fluid antenna size that maximizes the achievable rate.
Submitted 1 November, 2024; v1 submitted 24 May, 2024; originally announced May 2024.
Comments: 13 pages, 16 figures, including subfigures. Accepted by IEEE TWC
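A toy numerical illustration (not from the paper) of why half-wavelength sampling over a finite aperture leaves residual reconstruction error that oversampling reduces; the multipath model and all parameters below are made up:

import numpy as np

rng = np.random.default_rng(0)
lam, W = 1.0, 5.0                       # wavelength and aperture (normalized)
x = np.linspace(0.0, W, 2000)           # dense grid approximating the true channel
angles = rng.uniform(-np.pi / 2, np.pi / 2, 6)
gains = (rng.normal(size=6) + 1j * rng.normal(size=6)) / np.sqrt(6)

def field(pos):                         # toy multipath channel along the aperture
    return sum(g * np.exp(2j * np.pi * np.sin(a) * pos / lam)
               for g, a in zip(gains, angles))

def recon_nmse(delta):                  # sample at spacing delta, sinc-interpolate
    xs = np.arange(0.0, W + 1e-9, delta)
    hr = (field(xs)[None, :] * np.sinc((x[:, None] - xs[None, :]) / delta)).sum(1)
    h = field(x)
    return np.mean(np.abs(h - hr) ** 2) / np.mean(np.abs(h) ** 2)

for delta in (lam / 2, lam / 4, lam / 8):   # Nyquist vs. 2x and 4x oversampling
    print(f"spacing {delta:.3f}: NMSE = {recon_nmse(delta):.3e}")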
arXiv:2405.05715 [pdf, other] (eess.SP)
Shifting the ISAC Trade-Off with Fluid Antenna Systems
Authors: Jiaqi Zou, Hao Xu, Chao Wang, Lvxin Xu, Songlin Sun, Kaitao Meng, Christos Masouros, Kai-Kit Wong
Abstract: As an emerging antenna technology, a fluid antenna system (FAS) enhances spatial diversity to improve both sensing and communication performance by shifting the active antennas among available ports. In this letter, we study the potential of shifting the integrated sensing and communication (ISAC) trade-off with FAS. We propose a model for FAS-enabled ISAC and jointly optimize the transmit beamforming and the port selection of FAS. In particular, we aim to minimize the transmit power while satisfying both communication and sensing requirements. An efficient iterative algorithm based on sparse optimization, convex approximation, and a penalty approach is developed. The simulation results show that the proposed scheme can attain a 33% reduction in transmit power with guaranteed sensing and communication performance, showing the great potential of the fluid antenna for striking a flexible tradeoff between sensing and communication in ISAC systems.
Submitted 9 May, 2024; originally announced May 2024.
Comments: 5 pages, 5 figures

arXiv:2405.02361 [pdf, other] (eess.IV)
Technical report on target classification in SAR track
Authors: Haonan Xu, Han Yinan, Haotian Si, Yang Yang
Abstract: This report proposes a robust method for classifying oceanic and atmospheric phenomena using synthetic aperture radar (SAR) imagery.
Our proposed method leverages the powerful pre-trained model Swin Transformer v2 Large as the backbone and employs carefully designed data augmentation and an exponential moving average during training to enhance the model's generalization capability and stability. In the testing stage, a method called ReAct is utilized to rectify activation values, and the Energy Score is used for a more accurate measurement of model uncertainty, significantly improving out-of-distribution detection performance. Furthermore, test-time augmentation is employed to enhance classification accuracy and prediction stability. Comprehensive experimental results demonstrate that each additional technique significantly improves classification accuracy, confirming their effectiveness in classifying maritime and atmospheric phenomena in SAR imagery.
Submitted 3 May, 2024; originally announced May 2024.
Comments: arXiv admin note: text overlap with arXiv:2310.06221, arXiv:2111.12797 by other authors
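ReAct and the Energy Score are generic, published OOD-detection techniques; a minimal PyTorch sketch of how they compose, with the clipping threshold c and decision threshold tau as hypothetical values rather than the report's settings:

import torch
import torch.nn as nn

def energy_score(logits, T=1.0):
    # lower energy suggests in-distribution; higher suggests OOD
    return -T * torch.logsumexp(logits / T, dim=-1)

class ReActClassifier(nn.Module):
    def __init__(self, backbone, head, c=1.0):
        super().__init__()
        self.backbone, self.head, self.c = backbone, head, c

    def forward(self, x):
        feats = self.backbone(x)
        feats = feats.clamp(max=self.c)   # ReAct: rectify (clip) activations at c
        return self.head(feats)

# usage: logits = model(images); flag OOD samples where energy_score(logits) > tau,
# with tau chosen on validation data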
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">arXiv admin note: text overlap with arXiv:2310.06221, arXiv:2111.12797 by other authors</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.02132">arXiv:2405.02132</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2405.02132">pdf</a>, <a href="https://arxiv.org/format/2405.02132">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Unveiling the Potential of LLM-Based ASR on Chinese Open-Source Datasets </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Geng%2C+X">Xuelong Geng</a>, <a href="/search/eess?searchtype=author&amp;query=Xu%2C+T">Tianyi Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Wei%2C+K">Kun Wei</a>, <a href="/search/eess?searchtype=author&amp;query=Mu%2C+B">Bingshen Mu</a>, <a href="/search/eess?searchtype=author&amp;query=Xue%2C+H">Hongfei Xue</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+H">He Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+Y">Yangze Li</a>, <a href="/search/eess?searchtype=author&amp;query=Guo%2C+P">Pengcheng Guo</a>, <a href="/search/eess?searchtype=author&amp;query=Dai%2C+Y">Yuhang Dai</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+L">Longhao Li</a>, <a href="/search/eess?searchtype=author&amp;query=Shao%2C+M">Mingchen Shao</a>, <a href="/search/eess?searchtype=author&amp;query=Xie%2C+L">Lei Xie</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.02132v3-abstract-short" style="display: inline;"> Large Language Models (LLMs) have demonstrated unparalleled effectiveness in various NLP tasks, and integrating LLMs with automatic speech recognition (ASR) is becoming a mainstream paradigm. Building upon this momentum, our research delves into an in-depth examination of this paradigm on a large open-source Chinese dataset. Specifically, our research aims to evaluate the impact of various configu&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.02132v3-abstract-full').style.display = 'inline'; document.getElementById('2405.02132v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.02132v3-abstract-full" style="display: none;"> Large Language Models (LLMs) have demonstrated unparalleled effectiveness in various NLP tasks, and integrating LLMs with automatic speech recognition (ASR) is becoming a mainstream paradigm. Building upon this momentum, our research delves into an in-depth examination of this paradigm on a large open-source Chinese dataset. Specifically, our research aims to evaluate the impact of various configurations of speech encoders, LLMs, and projector modules in the context of the speech foundation encoder-LLM ASR paradigm. 
Furthermore, we introduce a three-stage training approach, expressly developed to enhance the model's ability to align auditory and textual information. The implementation of this approach, alongside the strategic integration of ASR components, enabled us to achieve state-of-the-art performance on the AISHELL-1, Test_Net, and Test_Meeting test sets. Our analysis presents an empirical foundation for future research in LLM-based ASR systems and offers insights into optimizing performance using Chinese datasets. We will publicly release all scripts used for data preparation, training, inference, and scoring, as well as pre-trained models and training logs, to promote reproducible research.
Submitted 4 November, 2024; v1 submitted 3 May, 2024; originally announced May 2024.
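A hedged sketch of the encoder-projector-LLM wiring this paradigm generally uses; the dimensions and frame-stacking downsampling are our assumptions, not the paper's released configuration:

import torch
import torch.nn as nn

class SpeechProjector(nn.Module):
    def __init__(self, enc_dim=512, llm_dim=4096, stack=4):
        super().__init__()
        self.stack = stack                              # downsample by frame stacking
        self.proj = nn.Linear(enc_dim * stack, llm_dim)

    def forward(self, frames):                          # frames: (T, enc_dim)
        T = frames.size(0) // self.stack * self.stack
        stacked = frames[:T].reshape(-1, self.stack * frames.size(1))
        return self.proj(stacked)                       # (T // stack, llm_dim)

# the projected speech frames are then prepended to the text-token embeddings
# that form the LLM input sequence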
arXiv:2404.19167 [pdf] (eess.IV, physics.med-ph)
Advancing low-field MRI with a universal denoising imaging transformer: Towards fast and high-quality imaging
Authors: Zheren Zhu, Azaan Rehman, Xiaozhi Cao, Congyu Liao, Yoo Jin Lee, Michael Ohliger, Hui Xue, Yang Yang
Abstract: Recent developments in low-field (LF) magnetic resonance imaging (MRI) systems present remarkable opportunities for affordable and widespread MRI access. A robust denoising method to overcome the intrinsic low signal-to-noise-ratio (SNR) barrier is critical to the success of LF MRI. However, current data-driven MRI denoising methods predominantly handle magnitude images and rely on customized models with constrained data diversity and quantity, which exhibit limited generalizability in clinical applications across diverse MRI systems, pulse sequences, and organs. In this study, we present ImT-MRD: a complex-valued imaging transformer trained on a vast number of clinical MRI scans, aiming at universal MR denoising on LF systems. Compared with averaging multiple repeated scans for higher image SNR, the model obtains better image quality from fewer repetitions, demonstrating its capability for accelerating scans under various clinical settings. Moreover, with its complex-valued image input, the model can denoise intermediate results before advanced post-processing and prepare high-quality data for further MRI research. By delivering universal and accurate denoising across clinical and research tasks, our model holds great promise to expedite the evolution of LF MRI for accessible and equitable biomedical applications.
Submitted 29 April, 2024; originally announced April 2024.
arXiv:2404.11313 [pdf, other] (eess.IV, cs.AI)
NTIRE 2024 Challenge on Short-form UGC Video Quality Assessment: Methods and Results
Authors: Xin Li, Kun Yuan, Yajing Pei, Yiting Lu, Ming Sun, Chao Zhou, Zhibo Chen, Radu Timofte, Wei Sun, Haoning Wu, Zicheng Zhang, Jun Jia, Zhichao Zhang, Linhan Cao, Qiubo Chen, Xiongkuo Min, Weisi Lin, Guangtao Zhai, Jianhui Sun, Tianyi Wang, Lei Li, Han Kong, Wenxuan Wang, Bing Li, Cheng Luo, et al. (43 additional authors not shown)
Abstract: This paper reviews the NTIRE 2024 Challenge on Short-form UGC Video Quality Assessment (S-UGC VQA), where various excellent solutions were submitted and evaluated on KVQ, a dataset collected from the popular short-form video platform Kuaishou/Kwai. The KVQ database is divided into three parts: 2926 videos for training, 420 videos for validation, and 854 videos for testing.
The purpose is to build new benchmarks and advance the development of S-UGC VQA. The competition had 200 participants, and 13 teams submitted valid solutions for the final testing phase. The proposed solutions achieved state-of-the-art performance for S-UGC VQA. The project can be found at https://github.com/lixinustc/KVQChallenge-CVPR-NTIRE2024.
Submitted 17 April, 2024; originally announced April 2024.
Comments: Accepted by CVPR2024 Workshop. The challenge report for the CVPR NTIRE2024 Short-form UGC Video Quality Assessment Challenge

arXiv:2404.10343 [pdf, other] (cs.CV, eess.IV)
The Ninth NTIRE 2024 Efficient Super-Resolution Challenge Report
Authors: Bin Ren, Yawei Li, Nancy Mehta, Radu Timofte, Hongyuan Yu, Cheng Wan, Yuxin Hong, Bingnan Han, Zhuoyuan Wu, Yajun Zou, Yuqing Liu, Jizhe Li,
href="/search/eess?searchtype=author&amp;query=He%2C+K">Keji He</a>, <a href="/search/eess?searchtype=author&amp;query=Fan%2C+C">Chao Fan</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+H">Heng Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+X">Xiaolin Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Yin%2C+X">Xuanwu Yin</a>, <a href="/search/eess?searchtype=author&amp;query=Zuo%2C+K">Kunlong Zuo</a>, <a href="/search/eess?searchtype=author&amp;query=Liao%2C+B">Bohao Liao</a>, <a href="/search/eess?searchtype=author&amp;query=Xia%2C+P">Peizhe Xia</a>, <a href="/search/eess?searchtype=author&amp;query=Peng%2C+L">Long Peng</a>, <a href="/search/eess?searchtype=author&amp;query=Du%2C+Z">Zhibo Du</a>, <a href="/search/eess?searchtype=author&amp;query=Di%2C+X">Xin Di</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+W">Wangkai Li</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+Y">Yang Wang</a> , et al. (109 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.10343v2-abstract-short" style="display: inline;"> This paper provides a comprehensive review of the NTIRE 2024 challenge, focusing on efficient single-image super-resolution (ESR) solutions and their outcomes. The task of this challenge is to super-resolve an input image with a magnification factor of x4 based on pairs of low and corresponding high-resolution images. The primary objective is to develop networks that optimize various aspects such&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.10343v2-abstract-full').style.display = 'inline'; document.getElementById('2404.10343v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.10343v2-abstract-full" style="display: none;"> This paper provides a comprehensive review of the NTIRE 2024 challenge, focusing on efficient single-image super-resolution (ESR) solutions and their outcomes. The task of this challenge is to super-resolve an input image with a magnification factor of x4 based on pairs of low and corresponding high-resolution images. The primary objective is to develop networks that optimize various aspects such as runtime, parameters, and FLOPs, while still maintaining a peak signal-to-noise ratio (PSNR) of approximately 26.90 dB on the DIV2K_LSDIR_valid dataset and 26.99 dB on the DIV2K_LSDIR_test dataset. In addition, this challenge has 4 tracks including the main track (overall performance), sub-track 1 (runtime), sub-track 2 (FLOPs), and sub-track 3 (parameters). In the main track, all three metrics (ie runtime, FLOPs, and parameter count) were considered. The ranking of the main track is calculated based on a weighted sum-up of the scores of all other sub-tracks. In sub-track 1, the practical runtime performance of the submissions was evaluated, and the corresponding score was used to determine the ranking. In sub-track 2, the number of FLOPs was considered. The score calculated based on the corresponding FLOPs was used to determine the ranking. In sub-track 3, the number of parameters was considered. The score calculated based on the corresponding parameters was used to determine the ranking. RLFN is set as the baseline for efficiency measurement. 
The challenge had 262 registered participants, and 34 teams made valid submissions, gauging the state of the art in efficient single-image super-resolution. To facilitate the reproducibility of the challenge and enable other researchers to build upon these findings, the code and the pre-trained models of validated solutions are made publicly available at https://github.com/Amazingren/NTIRE2024_ESR/.
Submitted 25 June, 2024; v1 submitted 16 April, 2024; originally announced April 2024.
Comments: The report paper of NTIRE2024 Efficient Super-resolution, accepted by CVPRW2024
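A hedged sketch of such a weighted ranking; the abstract does not give the weights or the normalization, so the baseline-relative scores and weights below are hypothetical, for illustration only:

def subtrack_score(value, baseline):
    # relative efficiency against the RLFN baseline (hypothetical normalization:
    # >1 means more efficient than the baseline)
    return baseline / value

def main_track_score(runtime, flops, params, base, weights=(0.5, 0.25, 0.25)):
    parts = (subtrack_score(runtime, base["runtime"]),
             subtrack_score(flops, base["flops"]),
             subtrack_score(params, base["params"]))
    return sum(w * s for w, s in zip(weights, parts))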
arXiv:2404.04295 [pdf, other] (cs.CL, cs.LG, cs.SD, eess.AS)
Transducers with Pronunciation-aware Embeddings for Automatic Speech Recognition
Authors: Hainan Xu, Zhehuai Chen, Fei Jia, Boris Ginsburg
Abstract: This paper proposes Transducers with Pronunciation-aware Embeddings (PET). Unlike conventional Transducers, where the decoder embeddings for different tokens are trained independently, the PET model's decoder embedding incorporates shared components for text tokens with the same or similar pronunciations. With experiments conducted on multiple datasets in Mandarin Chinese and Korean, we show that PET models consistently improve speech recognition accuracy compared to conventional Transducers. Our investigation also uncovers a phenomenon that we call error chain reactions: instead of recognition errors being evenly spread throughout an utterance, they tend to group together, with subsequent errors often following earlier ones. Our analysis shows that PET models effectively mitigate this issue by substantially reducing the likelihood of the model generating additional errors following a prior one. Our implementation will be open-sourced with the NeMo toolkit.
Submitted 4 April, 2024; originally announced April 2024.
Comments: Accepted at the ICASSP 2024 conference
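A minimal sketch of a decoder embedding with a pronunciation-shared component, our illustration of the stated idea rather than the released NeMo code; token_to_pron is an assumed token-to-pronunciation lookup:

import torch
import torch.nn as nn

class PronunciationAwareEmbedding(nn.Module):
    def __init__(self, n_tokens, n_prons, dim, token_to_pron):
        super().__init__()
        # token_to_pron: assumed mapping from each token id to a pronunciation id
        self.register_buffer("t2p", torch.as_tensor(token_to_pron))
        self.pron = nn.Embedding(n_prons, dim)    # shared among homophones
        self.token = nn.Embedding(n_tokens, dim)  # token-specific residual

    def forward(self, token_ids):
        return self.pron(self.t2p[token_ids]) + self.token(token_ids)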
arXiv:2404.02384 [pdf] (eess.IV)
Inline AI: Open-source Deep Learning Inference for Cardiac MR
Authors: Hui Xue, Rhodri H Davies, James Howard, Hunain Shiwani, Azaan Rehman, Iain Pierce, Henry Procter, Marianna Fontana, James C Moon, Eylem Levelt, Peter Kellman
Abstract: Cardiac Magnetic Resonance (CMR) is established as a non-invasive imaging technique for evaluation of heart function, anatomy, and myocardial tissue characterization. Quantitative biomarkers are central to the diagnosis and management of heart disease. Deep learning (DL) is playing an ever more important role in extracting these quantitative measures from CMR images. While many researchers have reported promising results in training and evaluating models, model deployment into the imaging workflow is less explored. We developed and open-sourced a new imaging AI framework, InlineAI. Its main innovation is to enable model inference inline, as part of the imaging computation rather than as an offline post-processing step, and to allow users to plug in their own models. We demonstrate the system's capability on three applications: long-axis CMR cine landmark detection, short-axis CMR cine analysis of function and anatomy, and quantitative perfusion mapping. InlineAI allowed models to be deployed into the imaging workflow in a streaming manner directly on the scanner: the model was loaded, inference on incoming images was performed while data acquisition was ongoing, and results were sent back to the scanner. Several biomarkers were extracted from model outputs in the demonstrated applications and reported as curves and tabular values. All processes are fully automated; model inference was completed within 6-45 s after the end of imaging data acquisition.
Submitted 2 April, 2024; originally announced April 2024.
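A minimal sketch of the inline, streaming pattern the abstract describes; this is a generic queue-based stand-in, not the InlineAI API, and model_infer and send_back are placeholders for a plugged-in model and the scanner-facing channel:

import queue

def inline_worker(in_q: "queue.Queue", send_back, model_infer):
    # consume images as the scanner streams them; None marks end of acquisition
    while (image := in_q.get()) is not None:
        send_back(model_infer(image))   # results flow back while the scan continues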
arXiv:2404.02382 [pdf] (eess.IV)
Imaging transformer for MRI denoising with the SNR unit training: enabling generalization across field-strengths, imaging contrasts, and anatomy
Authors: Hui Xue, Sarah Hooper, Azaan Rehman, Iain Pierce, Thomas Treibel, Rhodri Davies, W Patricia Bandettini, Rajiv Ramasawmy, Ahsan Javed, Zheren Zhu, Yang Yang, James Moon, Adrienne Campbell, Peter Kellman
Abstract: The ability to recover MRI signal from noise is key to achieving fast acquisition, accurate quantification, and high image quality. Past work has shown convolutional neural networks can be used with abundant and paired low- and high-SNR images for training. However, for applications where high-SNR data is difficult to produce at scale (e.g. with aggressive acceleration, high resolution, or low field strength), training a new denoising network using a large quantity of high-SNR images can be infeasible. In this study, we overcome this limitation by improving the generalization of denoising models, enabling application to many settings beyond what appears in the training data.
Specifically, we a) develop a training scheme that uses complex MRIs reconstructed in SNR units (i.e., the images have a fixed noise level; SNR unit training) and augments images with realistic noise based on the coil g-factor, and b) develop a novel imaging transformer (imformer) to handle 2D, 2D+T, and 3D MRIs in one model architecture. Through empirical evaluation, we show this combination improves performance compared to CNN models and improves generalization, enabling a denoising model to be used across field-strengths, image contrasts, and anatomy.
Submitted 2 April, 2024; originally announced April 2024.
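A hedged sketch of the noise-augmentation step as described: with images in SNR units the noise level is fixed, so realistic training noise can be synthesized by scaling complex Gaussian noise with a g-factor map. The function name and sigma range are our assumptions:

import numpy as np

def augment_snr_unit(image, gfactor, rng, sigma_range=(0.5, 4.0)):
    # image, gfactor: same-shape arrays; image already scaled to SNR units
    sigma = rng.uniform(*sigma_range)
    noise = rng.normal(size=image.shape) + 1j * rng.normal(size=image.shape)
    noisy = image + sigma * gfactor * noise / np.sqrt(2)
    return noisy, image                 # (network input, clean target)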
arXiv:2403.19983 [pdf, other] (eess.IV, cs.CV)
A multi-stage semi-supervised learning for ankle fracture classification on CT images
Authors: Hongzhi Liu, Guicheng Li, Jiacheng Nie, Hui Tang, Chunfeng Yang, Qianjin Feng, Hailin Xu, Yang Chen
Abstract: Because of the complicated mechanism of ankle injury, it is very difficult to diagnose ankle fractures in the clinic. In order to simplify the process of fracture diagnosis, an automatic diagnosis model of ankle fracture is proposed. Firstly, a tibia-fibula segmentation network is proposed for the joint tibiofibular region of the ankle joint, and the corresponding segmentation dataset is established on the basis of fracture data. Secondly, an image registration method is used to register the bone segmentation mask with a normal bone mask. Finally, a semi-supervised classifier is constructed to make full use of a large number of unlabeled data to classify ankle fractures. Experiments show that the proposed method can accurately segment fractures with fracture lines and has better performance than the general method. At the same time, this method is superior to the classification network on several metrics.
Submitted 29 March, 2024; originally announced March 2024.
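The abstract does not spell out the semi-supervised scheme; as a generic illustration of making use of unlabeled data, a confidence-thresholded pseudo-labeling step might look like this (the threshold is hypothetical):

import torch

def pseudo_label(model, unlabeled, threshold=0.95):
    # keep only the unlabeled samples the model is confident about,
    # together with the predicted labels, for use as extra training data
    with torch.no_grad():
        probs = torch.softmax(model(unlabeled), dim=-1)
        conf, labels = probs.max(dim=-1)
    keep = conf >= threshold
    return unlabeled[keep], labels[keep]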
