Search | arXiv e-print repository

<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1&ndash;50 of 538 results for author: <span class="mathjax">Li, C</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> <div class="content"> <form method="GET" action="/search/eess" aria-role="search"> Searching in archive <strong>eess</strong>. <a href="/search/?searchtype=author&amp;query=Li%2C+C">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Li, C"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Li%2C+C&amp;terms-0-field=author&amp;size=50&amp;order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Li, C"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Li%2C+C&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Li%2C+C&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Li%2C+C&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Li%2C+C&amp;start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Li%2C+C&amp;start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Li%2C+C&amp;start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">&hellip;</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.09940">arXiv:2502.09940</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.09940">pdf</a>, <a href="https://arxiv.org/format/2502.09940">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> A Preliminary Exploration with GPT-4o Voice Mode </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Lin%2C+Y">Yu-Xiang Lin</a>, <a href="/search/eess?searchtype=author&amp;query=Yang%2C+C">Chih-Kai Yang</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+W">Wei-Chih Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+C">Chen-An Li</a>, <a href="/search/eess?searchtype=author&amp;query=Huang%2C+C">Chien-yu Huang</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+X">Xuanjun Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Lee%2C+H">Hung-yi Lee</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.09940v1-abstract-short" style="display: inline;"> With the rise of multimodal large language models, GPT-4o stands out as a pioneering model, driving us to evaluate its capabilities. 
This report assesses GPT-4o across various tasks to analyze its audio processing and reasoning abilities. We find that GPT-4o exhibits strong knowledge in audio, speech, and music understanding, performing well in tasks like intent classification, spoken command clas&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.09940v1-abstract-full').style.display = 'inline'; document.getElementById('2502.09940v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.09940v1-abstract-full" style="display: none;"> With the rise of multimodal large language models, GPT-4o stands out as a pioneering model, driving us to evaluate its capabilities. This report assesses GPT-4o across various tasks to analyze its audio processing and reasoning abilities. We find that GPT-4o exhibits strong knowledge in audio, speech, and music understanding, performing well in tasks like intent classification, spoken command classification, semantic and grammatical reasoning., multilingual speech recognition, and singing analysis. It also shows greater robustness against hallucinations than other large audio-language models (LALMs). However, it struggles with tasks such as audio duration prediction and instrument classification. Additionally, GPT-4o&#39;s safety mechanisms cause it to decline tasks like speaker identification, age classification, MOS prediction, and audio deepfake detection. Notably, the model exhibits a significantly different refusal rate when responding to speaker verification tasks on different datasets. This is likely due to variations in the accompanying instructions or the quality of the input audio, suggesting the sensitivity of its built-in safeguards. Finally, we acknowledge that model performance varies with evaluation protocols. This report only serves as a preliminary exploration of the current state of LALMs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.09940v1-abstract-full').style.display = 'none'; document.getElementById('2502.09940v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Work in progress</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.09662">arXiv:2502.09662</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.09662">pdf</a>, <a href="https://arxiv.org/format/2502.09662">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Quantitative Methods">q-bio.QM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Generalizable Cervical Cancer Screening via Large-scale Pretraining and Test-Time Adaptation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Jiang%2C+H">Hao Jiang</a>, <a href="/search/eess?searchtype=author&amp;query=Jin%2C+C">Cheng Jin</a>, <a href="/search/eess?searchtype=author&amp;query=Lin%2C+H">Huangjing Lin</a>, <a href="/search/eess?searchtype=author&amp;query=Zhou%2C+Y">Yanning Zhou</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+X">Xi Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Ma%2C+J">Jiabo Ma</a>, <a href="/search/eess?searchtype=author&amp;query=Ding%2C+L">Li Ding</a>, <a href="/search/eess?searchtype=author&amp;query=Hou%2C+J">Jun Hou</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+R">Runsheng Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Chai%2C+Z">Zhizhong Chai</a>, <a href="/search/eess?searchtype=author&amp;query=Luo%2C+L">Luyang Luo</a>, <a href="/search/eess?searchtype=author&amp;query=Shi%2C+H">Huijuan Shi</a>, <a href="/search/eess?searchtype=author&amp;query=Qian%2C+Y">Yinling Qian</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+Q">Qiong Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+C">Changzhong Li</a>, <a href="/search/eess?searchtype=author&amp;query=Han%2C+A">Anjia Han</a>, <a href="/search/eess?searchtype=author&amp;query=Chan%2C+R+C+K">Ronald Cheong Kin Chan</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+H">Hao Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.09662v1-abstract-short" style="display: inline;"> Cervical cancer is a leading malignancy in female reproductive system. While AI-assisted cytology offers a cost-effective and non-invasive screening solution, current systems struggle with generalizability in complex clinical scenarios. To address this issue, we introduced Smart-CCS, a generalizable Cervical Cancer Screening paradigm based on pretraining and adaptation to create robust and general&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.09662v1-abstract-full').style.display = 'inline'; document.getElementById('2502.09662v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.09662v1-abstract-full" style="display: none;"> Cervical cancer is a leading malignancy in female reproductive system. 
While AI-assisted cytology offers a cost-effective and non-invasive screening solution, current systems struggle with generalizability in complex clinical scenarios. To address this issue, we introduced Smart-CCS, a generalizable Cervical Cancer Screening paradigm based on pretraining and adaptation to create robust and generalizable screening systems. To develop and validate Smart-CCS, we first curated a large-scale, multi-center dataset named CCS-127K, which comprises a total of 127,471 cervical cytology whole-slide images collected from 48 medical centers. By leveraging large-scale self-supervised pretraining, our CCS models are equipped with strong generalization capability, potentially generalizing across diverse scenarios. Then, we incorporated test-time adaptation to specifically optimize the trained CCS model for complex clinical settings, which adapts and refines predictions, improving real-world applicability. We conducted large-scale system evaluation among various cohorts. In retrospective cohorts, Smart-CCS achieved an overall area under the curve (AUC) value of 0.965 and sensitivity of 0.913 for cancer screening on 11 internal test datasets. In external testing, system performance maintained high at 0.950 AUC across 6 independent test datasets. In prospective cohorts, our Smart-CCS achieved AUCs of 0.947, 0.924, and 0.986 in three prospective centers, respectively. Moreover, the system demonstrated superior sensitivity in diagnosing cervical cancer, confirming the accuracy of our cancer screening results by using histology findings for validation. Interpretability analysis with cell and slide predictions further indicated that the system&#39;s decision-making aligns with clinical practice. Smart-CCS represents a significant advancement in cancer screening across diverse clinical contexts. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.09662v1-abstract-full').style.display = 'none'; document.getElementById('2502.09662v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.06156">arXiv:2502.06156</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.06156">pdf</a>, <a href="https://arxiv.org/ps/2502.06156">ps</a>, <a href="https://arxiv.org/format/2502.06156">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="High Energy Physics - Phenomenology">hep-ph</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Axial current as the origin of quantum intrinsic orbital angular momentum </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Amat%2C+O">Orkash Amat</a>, <a href="/search/eess?searchtype=author&amp;query=Nurmamat%2C+N">Nurimangul Nurmamat</a>, <a href="/search/eess?searchtype=author&amp;query=Huang%2C+Y">Yong-Feng Huang</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+C">Cheng-Ming Li</a>, <a href="/search/eess?searchtype=author&amp;query=Geng%2C+J">Jin-Jun Geng</a>, <a href="/search/eess?searchtype=author&amp;query=Hu%2C+C">Chen-Ran Hu</a>, <a href="/search/eess?searchtype=author&amp;query=Zou%2C+Z">Ze-Cheng Zou</a>, <a href="/search/eess?searchtype=author&amp;query=Dong%2C+X">Xiao-Fei Dong</a>, <a href="/search/eess?searchtype=author&amp;query=Deng%2C+C">Chen Deng</a>, <a href="/search/eess?searchtype=author&amp;query=Xu%2C+F">Fan Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+X">Xiao-li Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Du%2C+C">Chen Du</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.06156v1-abstract-short" style="display: inline;"> We show that it is impossible to experimentally observe the quantum intrinsic orbital angular momentum (IOAM) effect without its axial current. Broadly speaking, we argue that the spiral or interference characteristics of the axial current density determine the occurrence of nonlinear or tunneling effects in any spacetimedependent quantum systems. Our findings offer a comprehensive theoretical fra&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06156v1-abstract-full').style.display = 'inline'; document.getElementById('2502.06156v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.06156v1-abstract-full" style="display: none;"> We show that it is impossible to experimentally observe the quantum intrinsic orbital angular momentum (IOAM) effect without its axial current. Broadly speaking, we argue that the spiral or interference characteristics of the axial current density determine the occurrence of nonlinear or tunneling effects in any spacetimedependent quantum systems. Our findings offer a comprehensive theoretical framework that addresses the limitations of Keldysh theory and provides new insights into the angular momentum properties of quantum systems, particularly in tunneling-dominated regimes. Using Wigner function methods, fermionic generalized two-level model, and Berry phase simulations, we predict that IOAM effect can persist even in pure quantum tunneling processes. 
These results open the door for experimental verification of IOAM effects in future high-intensity QED experiments, such as those using X-ray free electron lasers. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06156v1-abstract-full').style.display = 'none'; document.getElementById('2502.06156v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages, 2 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.05130">arXiv:2502.05130</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.05130">pdf</a>, <a href="https://arxiv.org/format/2502.05130">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Latent Swap Joint Diffusion for Long-Form Audio Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Dai%2C+Y">Yusheng Dai</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+C">Chenxi Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+C">Chang Li</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+C">Chen Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Du%2C+J">Jun Du</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+K">Kewei Li</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+R">Ruoyu Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Ma%2C+J">Jiefeng Ma</a>, <a href="/search/eess?searchtype=author&amp;query=Sun%2C+L">Lei Sun</a>, <a href="/search/eess?searchtype=author&amp;query=Gao%2C+J">Jianqing Gao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.05130v1-abstract-short" style="display: inline;"> Previous work on long-form audio generation using global-view diffusion or iterative generation demands significant training or inference costs. While recent advancements in multi-view joint diffusion for panoramic generation provide an efficient option, they struggle with spectrum generation with severe overlap distortions and high cross-view consistency costs. 
   Abstract: Previous work on long-form audio generation using global-view diffusion or iterative generation demands significant training or inference costs. While recent advancements in multi-view joint diffusion for panoramic generation provide an efficient option, they struggle with spectrum generation with severe overlap distortions and high cross-view consistency costs. We initially explore this phenomenon through the connectivity inheritance of latent maps and uncover that averaging operations excessively smooth the high-frequency components of the latent map. To address these issues, we propose Swap Forward (SaFa), a frame-level latent swap framework that synchronizes multiple diffusions to produce a globally coherent long audio with more spectrum details in a forward-only manner. At its core, the bidirectional Self-Loop Latent Swap is applied between adjacent views, leveraging stepwise diffusion trajectory to adaptively enhance high-frequency components without disrupting low-frequency components. Furthermore, to ensure cross-view consistency, the unidirectional Reference-Guided Latent Swap is applied between the reference and the non-overlap regions of each subview during the early stages, providing centralized trajectory guidance. Quantitative and qualitative experiments demonstrate that SaFa significantly outperforms existing joint diffusion methods and even training-based long audio generation models. Moreover, we find that it also adapts well to panoramic generation, achieving comparable state-of-the-art performance with greater efficiency and model generalizability. Project page is available at https://swapforward.github.io/.
   Submitted 7 February, 2025; originally announced February 2025.
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.03930">arXiv:2502.03930</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.03930">pdf</a>, <a href="https://arxiv.org/format/2502.03930">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> DiTAR: Diffusion Transformer Autoregressive Modeling for Speech Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Jia%2C+D">Dongya Jia</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+Z">Zhuo Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+J">Jiawei Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Du%2C+C">Chenpeng Du</a>, <a href="/search/eess?searchtype=author&amp;query=Wu%2C+J">Jian Wu</a>, <a href="/search/eess?searchtype=author&amp;query=Cong%2C+J">Jian Cong</a>, <a href="/search/eess?searchtype=author&amp;query=Zhuang%2C+X">Xiaobin Zhuang</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+C">Chumin Li</a>, <a href="/search/eess?searchtype=author&amp;query=Wei%2C+Z">Zhen Wei</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+Y">Yuping Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+Y">Yuxuan Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.03930v2-abstract-short" style="display: inline;"> Several recent studies have attempted to autoregressively generate continuous speech representations without discrete speech tokens by combining diffusion and autoregressive models, yet they often face challenges with excessive computational loads or suboptimal outcomes. In this work, we propose Diffusion Transformer Autoregressive Modeling (DiTAR), a patch-based autoregressive framework combining&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.03930v2-abstract-full').style.display = 'inline'; document.getElementById('2502.03930v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.03930v2-abstract-full" style="display: none;"> Several recent studies have attempted to autoregressively generate continuous speech representations without discrete speech tokens by combining diffusion and autoregressive models, yet they often face challenges with excessive computational loads or suboptimal outcomes. In this work, we propose Diffusion Transformer Autoregressive Modeling (DiTAR), a patch-based autoregressive framework combining a language model with a diffusion transformer. This approach significantly enhances the efficacy of autoregressive models for continuous tokens and reduces computational demands. 
DiTAR utilizes a divide-and-conquer strategy for patch generation, where the language model processes aggregated patch embeddings and the diffusion transformer subsequently generates the next patch based on the output of the language model. For inference, we propose defining temperature as the time point of introducing noise during the reverse diffusion ODE to balance diversity and determinism. We also show in the extensive scaling analysis that DiTAR has superb scalability. In zero-shot speech generation, DiTAR achieves state-of-the-art performance in robustness, speaker similarity, and naturalness. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.03930v2-abstract-full').style.display = 'none'; document.getElementById('2502.03930v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 6 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">16 pages, 8 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.00712">arXiv:2502.00712</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.00712">pdf</a>, <a href="https://arxiv.org/format/2502.00712">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Registration-Enhanced Segmentation Method for Prostate Cancer in Ultrasound Images </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Sang%2C+S">Shengtian Sang</a>, <a href="/search/eess?searchtype=author&amp;query=Jahanandish%2C+H">Hassan Jahanandish</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+C+X">Cynthia Xinran Li</a>, <a href="/search/eess?searchtype=author&amp;query=Bhattachary%2C+I">Indrani Bhattachary</a>, <a href="/search/eess?searchtype=author&amp;query=Lee%2C+J+H">Jeong Hoon Lee</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+L">Lichun Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Vesal%2C+S">Sulaiman Vesal</a>, <a href="/search/eess?searchtype=author&amp;query=Ghanouni%2C+P">Pejman Ghanouni</a>, <a href="/search/eess?searchtype=author&amp;query=Fan%2C+R">Richard Fan</a>, <a href="/search/eess?searchtype=author&amp;query=Sonn%2C+G+A">Geoffrey A. Sonn</a>, <a href="/search/eess?searchtype=author&amp;query=Rusu%2C+M">Mirabela Rusu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.00712v1-abstract-short" style="display: inline;"> Prostate cancer is a major cause of cancer-related deaths in men, where early detection greatly improves survival rates. 
   Abstract: Prostate cancer is a major cause of cancer-related deaths in men, where early detection greatly improves survival rates. Although MRI-TRUS fusion biopsy offers superior accuracy by combining MRI's detailed visualization with TRUS's real-time guidance, it is a complex and time-intensive procedure that relies heavily on manual annotations, leading to potential errors. To address these challenges, we propose a fully automatic MRI-TRUS fusion-based segmentation method that identifies prostate tumors directly in TRUS images without requiring manual annotations. Unlike traditional multimodal fusion approaches that rely on naive data concatenation, our method integrates a registration-segmentation framework to align and leverage spatial information between MRI and TRUS modalities. This alignment enhances segmentation accuracy and reduces reliance on manual effort. Our approach was validated on a dataset of 1,747 patients from Stanford Hospital, achieving an average Dice coefficient of 0.212, outperforming TRUS-only (0.117) and naive MRI-TRUS fusion (0.132) methods, with significant improvements (p < 0.01). This framework demonstrates the potential for reducing the complexity of prostate cancer diagnosis and provides a flexible architecture applicable to other multimodal medical imaging tasks.
   Submitted 2 February, 2025; originally announced February 2025.
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.00404">arXiv:2502.00404</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.00404">pdf</a>, <a href="https://arxiv.org/format/2502.00404">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Exploring Linear Attention Alternative for Single Image Super-Resolution </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Lu%2C+R">Rongchang Lu</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+C">Changyu Li</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+D">Donghang Li</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+G">Guojing Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Huang%2C+J">Jianqiang Huang</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+X">Xilai Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.00404v1-abstract-short" style="display: inline;"> Deep learning-based single-image super-resolution (SISR) technology focuses on enhancing low-resolution (LR) images into high-resolution (HR) ones. Although significant progress has been made, challenges remain in computational complexity and quality, particularly in remote sensing image processing. To address these issues, we propose our Omni-Scale RWKV Super-Resolution (OmniRWKVSR) model which p&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.00404v1-abstract-full').style.display = 'inline'; document.getElementById('2502.00404v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.00404v1-abstract-full" style="display: none;"> Deep learning-based single-image super-resolution (SISR) technology focuses on enhancing low-resolution (LR) images into high-resolution (HR) ones. Although significant progress has been made, challenges remain in computational complexity and quality, particularly in remote sensing image processing. To address these issues, we propose our Omni-Scale RWKV Super-Resolution (OmniRWKVSR) model which presents a novel approach that combines the Receptance Weighted Key Value (RWKV) architecture with feature extraction techniques such as Visual RWKV Spatial Mixing (VRSM) and Visual RWKV Channel Mixing (VRCM), aiming to overcome the limitations of existing methods and achieve superior SISR performance. This work has proved able to provide effective solutions for high-quality image reconstruction. Under the 4x Super-Resolution tasks, compared to the MambaIR model, we achieved an average improvement of 0.26% in PSNR and 0.16% in SSIM. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.00404v1-abstract-full').style.display = 'none'; document.getElementById('2502.00404v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This paper has been published to IEEE International Joint Conference on Neural Networks. Feel free to contact on nomodeset@qq.com</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.4.9 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.00366">arXiv:2502.00366</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.00366">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Prostate-Specific Foundation Models for Enhanced Detection of Clinically Significant Cancer </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Lee%2C+J+H">Jeong Hoon Lee</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+C+X">Cynthia Xinran Li</a>, <a href="/search/eess?searchtype=author&amp;query=Jahanandish%2C+H">Hassan Jahanandish</a>, <a href="/search/eess?searchtype=author&amp;query=Bhattacharya%2C+I">Indrani Bhattacharya</a>, <a href="/search/eess?searchtype=author&amp;query=Vesal%2C+S">Sulaiman Vesal</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+L">Lichun Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Sang%2C+S">Shengtian Sang</a>, <a href="/search/eess?searchtype=author&amp;query=Choi%2C+M+H">Moon Hyung Choi</a>, <a href="/search/eess?searchtype=author&amp;query=Soerensen%2C+S+J+C">Simon John Christoph Soerensen</a>, <a href="/search/eess?searchtype=author&amp;query=Zhou%2C+S+R">Steve Ran Zhou</a>, <a href="/search/eess?searchtype=author&amp;query=Sommer%2C+E+R">Elijah Richard Sommer</a>, <a href="/search/eess?searchtype=author&amp;query=Fan%2C+R">Richard Fan</a>, <a href="/search/eess?searchtype=author&amp;query=Ghanouni%2C+P">Pejman Ghanouni</a>, <a href="/search/eess?searchtype=author&amp;query=Song%2C+Y">Yuze Song</a>, <a href="/search/eess?searchtype=author&amp;query=Seibert%2C+T+M">Tyler M. Seibert</a>, <a href="/search/eess?searchtype=author&amp;query=Sonn%2C+G+A">Geoffrey A. Sonn</a>, <a href="/search/eess?searchtype=author&amp;query=Rusu%2C+M">Mirabela Rusu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.00366v2-abstract-short" style="display: inline;"> Accurate prostate cancer diagnosis remains challenging. Even when using MRI, radiologists exhibit low specificity and significant inter-observer variability, leading to potential delays or inaccuracies in identifying clinically significant cancers. 
   Abstract: Accurate prostate cancer diagnosis remains challenging. Even when using MRI, radiologists exhibit low specificity and significant inter-observer variability, leading to potential delays or inaccuracies in identifying clinically significant cancers. This leads to numerous unnecessary biopsies and risks of missing clinically significant cancers. Here we present prostate vision contrastive network (ProViCNet), prostate organ-specific vision foundation models for Magnetic Resonance Imaging (MRI) and Trans-Rectal Ultrasound imaging (TRUS) for comprehensive cancer detection. ProViCNet was trained and validated using 4,401 patients across six institutions, as a prostate cancer detection model on radiology images relying on patch-level contrastive learning guided by biopsy-confirmed radiologist annotations. ProViCNet demonstrated consistent performance across multiple internal and external validation cohorts with area under the receiver operating curve values ranging from 0.875 to 0.966, significantly outperforming radiologists in the reader study (0.907 versus 0.805, p<0.001) for mpMRI, while achieving 0.670 to 0.740 for TRUS. We also integrated ProViCNet with standard PSA to develop a virtual screening test, and we showed that we can maintain the high sensitivity for detecting clinically significant cancers while more than doubling specificity from 15% to 38% (p<0.001), thereby substantially reducing unnecessary biopsies. These findings highlight ProViCNet's potential for enhancing prostate cancer diagnosis accuracy and reducing unnecessary biopsies, thereby optimizing diagnostic pathways.
   Submitted 4 February, 2025; v1 submitted 1 February, 2025; originally announced February 2025.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">44pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.00146">arXiv:2502.00146</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.00146">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Multimodal MRI-Ultrasound AI for Prostate Cancer Detection Outperforms Radiologist MRI Interpretation: A Multi-Center Study </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Jahanandish%2C+H">Hassan Jahanandish</a>, <a href="/search/eess?searchtype=author&amp;query=Sang%2C+S">Shengtian Sang</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+C+X">Cynthia Xinran Li</a>, <a href="/search/eess?searchtype=author&amp;query=Vesal%2C+S">Sulaiman Vesal</a>, <a href="/search/eess?searchtype=author&amp;query=Bhattacharya%2C+I">Indrani Bhattacharya</a>, <a href="/search/eess?searchtype=author&amp;query=Lee%2C+J+H">Jeong Hoon Lee</a>, <a href="/search/eess?searchtype=author&amp;query=Fan%2C+R">Richard Fan</a>, <a href="/search/eess?searchtype=author&amp;query=Sonna%2C+G+A">Geoffrey A. Sonna</a>, <a href="/search/eess?searchtype=author&amp;query=Rusu%2C+M">Mirabela Rusu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.00146v1-abstract-short" style="display: inline;"> Pre-biopsy magnetic resonance imaging (MRI) is increasingly used to target suspicious prostate lesions. This has led to artificial intelligence (AI) applications improving MRI-based detection of clinically significant prostate cancer (CsPCa). However, MRI-detected lesions must still be mapped to transrectal ultrasound (TRUS) images during biopsy, which results in missing CsPCa. This study systemat&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.00146v1-abstract-full').style.display = 'inline'; document.getElementById('2502.00146v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.00146v1-abstract-full" style="display: none;"> Pre-biopsy magnetic resonance imaging (MRI) is increasingly used to target suspicious prostate lesions. This has led to artificial intelligence (AI) applications improving MRI-based detection of clinically significant prostate cancer (CsPCa). However, MRI-detected lesions must still be mapped to transrectal ultrasound (TRUS) images during biopsy, which results in missing CsPCa. This study systematically evaluates a multimodal AI framework integrating MRI and TRUS image sequences to enhance CsPCa identification. The study included 3110 patients from three cohorts across two institutions who underwent prostate biopsy. 
The proposed framework, based on the 3D UNet architecture, was evaluated on 1700 test cases, comparing performance to unimodal AI models that use either MRI or TRUS alone. Additionally, the proposed model was compared to radiologists in a cohort of 110 patients. The multimodal AI approach achieved superior sensitivity (80%) and Lesion Dice (42%) compared to unimodal MRI (73%, 30%) and TRUS models (49%, 27%). Compared to radiologists, the multimodal model showed higher specificity (88% vs. 78%) and Lesion Dice (38% vs. 33%), with equivalent sensitivity (79%). Our findings demonstrate the potential of multimodal AI to improve CsPCa lesion targeting during biopsy and treatment planning, surpassing current unimodal models and radiologists; ultimately improving outcomes for prostate cancer patients. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.00146v1-abstract-full').style.display = 'none'; document.getElementById('2502.00146v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.16014">arXiv:2501.16014</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.16014">pdf</a>, <a href="https://arxiv.org/format/2501.16014">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Spatial-Angular Representation Learning for High-Fidelity Continuous Super-Resolution in Diffusion MRI </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Wu%2C+R">Ruoyou Wu</a>, <a href="/search/eess?searchtype=author&amp;query=Cheng%2C+J">Jian Cheng</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+C">Cheng Li</a>, <a href="/search/eess?searchtype=author&amp;query=Zou%2C+J">Juan Zou</a>, <a href="/search/eess?searchtype=author&amp;query=Fan%2C+W">Wenxin Fan</a>, <a href="/search/eess?searchtype=author&amp;query=Guo%2C+H">Hua Guo</a>, <a href="/search/eess?searchtype=author&amp;query=Liang%2C+Y">Yong Liang</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+S">Shanshan Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.16014v1-abstract-short" style="display: inline;"> Diffusion magnetic resonance imaging (dMRI) often suffers from low spatial and angular resolution due to inherent limitations in imaging hardware and system noise, adversely affecting the accurate estimation of microstructural parameters with fine anatomical details. 
   Abstract: Diffusion magnetic resonance imaging (dMRI) often suffers from low spatial and angular resolution due to inherent limitations in imaging hardware and system noise, adversely affecting the accurate estimation of microstructural parameters with fine anatomical details. Deep learning-based super-resolution techniques have shown promise in enhancing dMRI resolution without increasing acquisition time. However, most existing methods are confined to either spatial or angular super-resolution, limiting their effectiveness in capturing detailed microstructural features. Furthermore, traditional pixel-wise loss functions struggle to recover intricate image details essential for high-resolution reconstruction. To address these challenges, we propose SARL-dMRI, a novel Spatial-Angular Representation Learning framework for high-fidelity, continuous super-resolution in dMRI. SARL-dMRI explores implicit neural representations and spherical harmonics to model continuous spatial and angular representations, simultaneously enhancing both spatial and angular resolution while improving microstructural parameter estimation accuracy. To further preserve image fidelity, a data-fidelity module and wavelet-based frequency loss are introduced, ensuring the super-resolved images remain consistent with the original input and retain fine details. Extensive experiments demonstrate that, compared to five other state-of-the-art methods, our method significantly enhances dMRI data resolution, improves the accuracy of microstructural parameter estimation, and provides better generalization capabilities. It maintains stable performance even under a 45× downsampling factor.
   Submitted 27 January, 2025; originally announced January 2025.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages, 6 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.15368">arXiv:2501.15368</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.15368">pdf</a>, <a href="https://arxiv.org/format/2501.15368">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Baichuan-Omni-1.5 Technical Report </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Li%2C+Y">Yadong Li</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+J">Jun Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+T">Tao Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+T">Tao Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+S">Song Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+T">Tianpeng Li</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+Z">Zehuan Li</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+L">Lijun Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Ming%2C+L">Lingfeng Ming</a>, <a href="/search/eess?searchtype=author&amp;query=Dong%2C+G">Guosheng Dong</a>, <a href="/search/eess?searchtype=author&amp;query=Pan%2C+D">Da Pan</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+C">Chong Li</a>, <a href="/search/eess?searchtype=author&amp;query=Fang%2C+Y">Yuanbo Fang</a>, <a href="/search/eess?searchtype=author&amp;query=Kuang%2C+D">Dongdong Kuang</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+M">Mingrui Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Zhu%2C+C">Chenglin Zhu</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+Y">Youwei Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Guo%2C+H">Hongyu Guo</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+F">Fengyu Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+Y">Yuran Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Ding%2C+B">Bowen Ding</a>, <a href="/search/eess?searchtype=author&amp;query=Song%2C+W">Wei Song</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+X">Xu Li</a>, <a href="/search/eess?searchtype=author&amp;query=Huo%2C+Y">Yuqi Huo</a>, <a href="/search/eess?searchtype=author&amp;query=Liang%2C+Z">Zheng Liang</a> , et al. (68 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.15368v1-abstract-short" style="display: inline;"> We introduce Baichuan-Omni-1.5, an omni-modal model that not only has omni-modal understanding capabilities but also provides end-to-end audio generation capabilities. To achieve fluent and high-quality interaction across modalities without compromising the capabilities of any modality, we prioritized optimizing three key aspects. 
First, we establish a comprehensive data cleaning and synthesis pip&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.15368v1-abstract-full').style.display = 'inline'; document.getElementById('2501.15368v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.15368v1-abstract-full" style="display: none;"> We introduce Baichuan-Omni-1.5, an omni-modal model that not only has omni-modal understanding capabilities but also provides end-to-end audio generation capabilities. To achieve fluent and high-quality interaction across modalities without compromising the capabilities of any modality, we prioritized optimizing three key aspects. First, we establish a comprehensive data cleaning and synthesis pipeline for multimodal data, obtaining about 500B high-quality data (text, audio, and vision). Second, an audio-tokenizer (Baichuan-Audio-Tokenizer) has been designed to capture both semantic and acoustic information from audio, enabling seamless integration and enhanced compatibility with MLLM. Lastly, we designed a multi-stage training strategy that progressively integrates multimodal alignment and multitask fine-tuning, ensuring effective synergy across all modalities. Baichuan-Omni-1.5 leads contemporary models (including GPT4o-mini and MiniCPM-o 2.6) in terms of comprehensive omni-modal capabilities. Notably, it achieves results comparable to leading models such as Qwen2-VL-72B across various multimodal medical benchmarks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.15368v1-abstract-full').style.display = 'none'; document.getElementById('2501.15368v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.15264">arXiv:2501.15264</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.15264">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Fusion of Millimeter-wave Radar and Pulse Oximeter Data for Low-burden Diagnosis of Obstructive Sleep Apnea-Hypopnea Syndrome </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Wang%2C+W">Wei Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+Z">Zhaoxi Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+W">Wenyu Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+Z">Zetao Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Zhao%2C+X">Xiang Zhao</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+C">Chenyang Li</a>, <a href="/search/eess?searchtype=author&amp;query=Guan%2C+J">Jian Guan</a>, <a href="/search/eess?searchtype=author&amp;query=Yin%2C+S">Shankai Yin</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+G">Gang Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.15264v1-abstract-short" style="display: inline;"> Objective: The aim of the study is to develop a novel method for improved diagnosis of obstructive sleep apnea-hypopnea syndrome (OSAHS) in clinical or home settings, with the focus on achieving diagnostic performance comparable to the gold-standard polysomnography (PSG) with significantly reduced monitoring burden. Methods: We propose a method using millimeter-wave radar and pulse oximeter for OS&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.15264v1-abstract-full').style.display = 'inline'; document.getElementById('2501.15264v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.15264v1-abstract-full" style="display: none;"> Objective: The aim of the study is to develop a novel method for improved diagnosis of obstructive sleep apnea-hypopnea syndrome (OSAHS) in clinical or home settings, with the focus on achieving diagnostic performance comparable to the gold-standard polysomnography (PSG) with significantly reduced monitoring burden. Methods: We propose a method using millimeter-wave radar and pulse oximeter for OSAHS diagnosis (ROSA). It contains a sleep apnea-hypopnea events (SAE) detection network, which directly predicts the temporal localization of SAE, and a sleep staging network, which predicts the sleep stages throughout the night, based on radar signals. It also fuses oxygen saturation (SpO2) information from the pulse oximeter to adjust the score of SAE detected by radar. Results: Experimental results on a real-world dataset (&gt;800 hours of overnight recordings, 100 subjects) demonstrated high agreement (ICC=0.9870) on apnea-hypopnea index (AHI) between ROSA and PSG. ROSA also exhibited excellent diagnostic performance, exceeding 90% in accuracy across AHI diagnostic thresholds of 5, 15 and 30 events/h. Conclusion: ROSA improves diagnostic accuracy by fusing millimeter-wave radar and pulse oximeter data. 
It provides a reliable and low-burden solution for OSAHS diagnosis. Significance: ROSA addresses the limitations of high complexity and monitoring burden associated with traditional PSG. The high accuracy and low burden of ROSA show its potential to improve the accessibility of OSAHS diagnosis among population. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.15264v1-abstract-full').style.display = 'none'; document.getElementById('2501.15264v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.13972">arXiv:2501.13972</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.13972">pdf</a>, <a href="https://arxiv.org/format/2501.13972">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Synthetic CT image generation from CBCT: A Systematic Review </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Altalib%2C+A">Alzahra Altalib</a>, <a href="/search/eess?searchtype=author&amp;query=McGregor%2C+S">Scott McGregor</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+C">Chunhui Li</a>, <a href="/search/eess?searchtype=author&amp;query=Perelli%2C+A">Alessandro Perelli</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.13972v1-abstract-short" style="display: inline;"> The generation of synthetic CT (sCT) images from cone-beam CT (CBCT) data using deep learning methodologies represents a significant advancement in radiation oncology. This systematic review, following PRISMA guidelines and using the PICO model, comprehensively evaluates the literature from 2014 to 2024 on the generation of sCT images for radiation therapy planning in oncology. A total of 35 relev&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.13972v1-abstract-full').style.display = 'inline'; document.getElementById('2501.13972v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.13972v1-abstract-full" style="display: none;"> The generation of synthetic CT (sCT) images from cone-beam CT (CBCT) data using deep learning methodologies represents a significant advancement in radiation oncology. This systematic review, following PRISMA guidelines and using the PICO model, comprehensively evaluates the literature from 2014 to 2024 on the generation of sCT images for radiation therapy planning in oncology. A total of 35 relevant studies were identified and analyzed, revealing the prevalence of deep learning approaches in the generation of sCT. This review comprehensively covers synthetic CT generation based on CBCT and proton-based studies. 
Some of the commonly employed architectures explored are convolutional neural networks (CNNs), generative adversarial networks (GANs), transformers, and diffusion models. Evaluation metrics including mean absolute error (MAE), root mean square error (RMSE), peak signal-to-noise ratio (PSNR) and structural similarity index (SSIM) consistently demonstrate the comparability of sCT images with gold-standard planning CTs (pCT), indicating their potential to improve treatment precision and patient outcomes. Challenges such as field-of-view (FOV) disparities and integration into clinical workflows are discussed, along with recommendations for future research and standardization efforts. In general, the findings underscore the promising role of sCT-based approaches in personalized treatment planning and adaptive radiation therapy, with potential implications for improved oncology treatment delivery and patient care. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.13972v1-abstract-full').style.display = 'none'; document.getElementById('2501.13972v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">21 pages, 14 Figures, Accepted in the IEEE Transactions on Radiation and Plasma Medical Sciences</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 68T07 <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> J.2 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.13751">arXiv:2501.13751</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.13751">pdf</a>, <a href="https://arxiv.org/format/2501.13751">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> On Disentangled Training for Nonlinear Transform in Learned Image Compression </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Li%2C+H">Han Li</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+S">Shaohui Li</a>, <a href="/search/eess?searchtype=author&amp;query=Dai%2C+W">Wenrui Dai</a>, <a href="/search/eess?searchtype=author&amp;query=Cao%2C+M">Maida Cao</a>, <a href="/search/eess?searchtype=author&amp;query=Kan%2C+N">Nuowen Kan</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+C">Chenglin Li</a>, <a href="/search/eess?searchtype=author&amp;query=Zou%2C+J">Junni Zou</a>, <a href="/search/eess?searchtype=author&amp;query=Xiong%2C+H">Hongkai Xiong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.13751v3-abstract-short" style="display: inline;"> Learned image compression (LIC) has demonstrated superior rate-distortion (R-D) performance compared to traditional 
codecs, but is challenged by training inefficiency that could incur more than two weeks to train a state-of-the-art model from scratch. Existing LIC methods overlook the slow convergence caused by compacting energy in learning nonlinear transforms. In this paper, we first reveal that&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.13751v3-abstract-full').style.display = 'inline'; document.getElementById('2501.13751v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.13751v3-abstract-full" style="display: none;"> Learned image compression (LIC) has demonstrated superior rate-distortion (R-D) performance compared to traditional codecs, but is challenged by training inefficiency that could incur more than two weeks to train a state-of-the-art model from scratch. Existing LIC methods overlook the slow convergence caused by compacting energy in learning nonlinear transforms. In this paper, we first reveal that such energy compaction consists of two components, i.e., feature decorrelation and uneven energy modulation. On such basis, we propose a linear auxiliary transform (AuxT) to disentangle energy compaction in training nonlinear transforms. The proposed AuxT obtains coarse approximation to achieve efficient energy compaction such that distribution fitting with the nonlinear transforms can be simplified to fine details. We then develop wavelet-based linear shortcuts (WLSs) for AuxT that leverages wavelet-based downsampling and orthogonal linear projection for feature decorrelation and subband-aware scaling for <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.13751v3-abstract-full').style.display = 'none'; document.getElementById('2501.13751v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 23 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. 
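<p>As background for the wavelet-based shortcuts mentioned in the abstract above, the sketch below shows the standard single-level 2D Haar transform: an orthogonal linear map that downsamples an image into four subbands and compacts most of the energy into the low-pass subband. It is a generic building block, not the paper's AuxT module; the test image and sizes are arbitrary.</p>
<pre><code>
# Minimal sketch of the Haar building block behind wavelet-based downsampling.
# Generic illustration, not the paper's AuxT/WLS implementation.
import numpy as np

def haar_dwt2(x):
    # x: (H, W) array with even H and W; returns LL, LH, HL, HH subbands
    a = x[0::2, 0::2]; b = x[0::2, 1::2]; c = x[1::2, 0::2]; d = x[1::2, 1::2]
    ll = (a + b + c + d) / 2.0
    lh = (a + b - c - d) / 2.0
    hl = (a - b + c - d) / 2.0
    hh = (a - b - c + d) / 2.0
    return ll, lh, hl, hh

rng = np.random.default_rng(0)
img = rng.normal(size=(64, 64)).cumsum(axis=0).cumsum(axis=1)  # smooth-ish test image
subbands = haar_dwt2(img)
total = sum(np.sum(s**2) for s in subbands)
for name, s in zip(["LL", "LH", "HL", "HH"], subbands):
    print(name, "energy share:", np.sum(s**2) / total)  # LL carries most of the energy
</code></pre>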
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ICLR2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.12644">arXiv:2501.12644</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.12644">pdf</a>, <a href="https://arxiv.org/format/2501.12644">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Emerging Technologies">cs.ET</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Applied Physics">physics.app-ph</span> </div> </div> <p class="title is-5 mathjax"> Current Opinions on Memristor-Accelerated Machine Learning Hardware </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Jiang%2C+M">Mingrui Jiang</a>, <a href="/search/eess?searchtype=author&amp;query=Xu%2C+Y">Yichun Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+Z">Zefan Li</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+C">Can Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.12644v1-abstract-short" style="display: inline;"> The unprecedented advancement of artificial intelligence has placed immense demands on computing hardware, but traditional silicon-based semiconductor technologies are approaching their physical and economic limit, prompting the exploration of novel computing paradigms. Memristor offers a promising solution, enabling in-memory analog computation and massive parallelism, which leads to low latency&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.12644v1-abstract-full').style.display = 'inline'; document.getElementById('2501.12644v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.12644v1-abstract-full" style="display: none;"> The unprecedented advancement of artificial intelligence has placed immense demands on computing hardware, but traditional silicon-based semiconductor technologies are approaching their physical and economic limit, prompting the exploration of novel computing paradigms. Memristor offers a promising solution, enabling in-memory analog computation and massive parallelism, which leads to low latency and power consumption. This manuscript reviews the current status of memristor-based machine learning accelerators, highlighting the milestones achieved in developing prototype chips, that not only accelerate neural networks inference but also tackle other machine learning tasks. More importantly, it discusses our opinion on current key challenges that remain in this field, such as device variation, the need for efficient peripheral circuitry, and systematic co-design and optimization. 
We also share our perspective on potential future directions, some of which address existing challenges while others explore untouched territories. By addressing these challenges through interdisciplinary efforts spanning device engineering, circuit design, and systems architecture, memristor-based accelerators could significantly advance the capabilities of AI hardware, particularly for edge applications where power efficiency is paramount. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.12644v1-abstract-full').style.display = 'none'; document.getElementById('2501.12644v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.12082">arXiv:2501.12082</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.12082">pdf</a>, <a href="https://arxiv.org/ps/2501.12082">ps</a>, <a href="https://arxiv.org/format/2501.12082">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> A Multi-annotated and Multi-modal Dataset for Wide-angle Video Quality Assessment </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Hu%2C+B">Bo Hu</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+W">Wei Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+C">Chunyi Li</a>, <a href="/search/eess?searchtype=author&amp;query=He%2C+L">Lihuo He</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+L">Leida Li</a>, <a href="/search/eess?searchtype=author&amp;query=Gao%2C+X">Xinbo Gao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.12082v1-abstract-short" style="display: inline;"> Wide-angle video is favored for its wide viewing angle and ability to capture a large area of scenery, making it an ideal choice for sports and adventure recording. However, wide-angle video is prone to deformation, exposure and other distortions, resulting in poor video quality and affecting the perception and experience, which may seriously hinder its application in fields such as competitive sp&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.12082v1-abstract-full').style.display = 'inline'; document.getElementById('2501.12082v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.12082v1-abstract-full" style="display: none;"> Wide-angle video is favored for its wide viewing angle and ability to capture a large area of scenery, making it an ideal choice for sports and adventure recording. 
However, wide-angle video is prone to deformation, exposure and other distortions, resulting in poor video quality and affecting the perception and experience, which may seriously hinder its application in fields such as competitive sports. Up to now, few explorations focus on the quality assessment issue of wide-angle video. This deficiency primarily stems from the absence of a specialized dataset for wide-angle videos. To bridge this gap, we construct the first Multi-annotated and multi-modal Wide-angle Video quality assessment (MWV) dataset. Then, the performances of state-of-the-art video quality methods on the MWV dataset are investigated by inter-dataset testing and intra-dataset testing. Experimental results show that these methods impose significant limitations on their applicability. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.12082v1-abstract-full').style.display = 'none'; document.getElementById('2501.12082v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.11093">arXiv:2501.11093</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.11093">pdf</a>, <a href="https://arxiv.org/format/2501.11093">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Channel Sounding Using Multiplicative Arrays Based on Successive Interference Cancellation Principle </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Jiang%2C+Z">Zhangzhang Jiang</a>, <a href="/search/eess?searchtype=author&amp;query=Yuan%2C+Z">Zhiqiang Yuan</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+C">Chunhui Li</a>, <a href="/search/eess?searchtype=author&amp;query=Yu%2C+L">Le Yu</a>, <a href="/search/eess?searchtype=author&amp;query=Fan%2C+W">Wei Fan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.11093v1-abstract-short" style="display: inline;"> Ultra-massive multiple-input and multiple-output (MIMO) systems have been seen as the key radio technology for the advancement of wireless communication systems, due to its capability to better utilize the spatial dimension of the propagation channels. Channel sounding is essential for developing accurate and realistic channel models for the massive MIMO systems. However, channel sounding with lar&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.11093v1-abstract-full').style.display = 'inline'; document.getElementById('2501.11093v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.11093v1-abstract-full" style="display: none;"> Ultra-massive multiple-input and multiple-output (MIMO) systems have been seen as the key radio technology for the advancement of wireless communication systems, due to its capability to better utilize the spatial dimension of the propagation channels. 
Channel sounding is essential for developing accurate and realistic channel models for the massive MIMO systems. However, channel sounding with large-scale antenna systems has faced significant challenges in practice. The real antenna array based (RAA) sounder suffers from high complexity and cost, while virtual antenna array (VAA) solutions are known for their long measurement time. Notably, these issues will become more pronounced as the antenna array configuration gets larger for future radio systems. In this paper, we propose the concept of multiplicative array (MA) for channel sounding applications to achieve large antenna aperture size with reduced number of required antenna elements. The unique characteristics of the MA are exploited for wideband spatial channel sounding purposes, supported by both one-path and multi-path numerical simulations. To address the fake paths and distortion in the angle delay profile issues inherent for MA in multipath channel sounding, a novel channel parameter estimation algorithm for MA based on successive interference cancellation (SIC) principle is proposed. Both numerical simulations and experimental validation results are provided to demonstrate the effectiveness and robustness of the proposed SIC algorithm for the MA. This research contributes significantly to the channel sounding and characterization of massive MIMO systems for future applications. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.11093v1-abstract-full').style.display = 'none'; document.getElementById('2501.11093v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.07897">arXiv:2501.07897</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.07897">pdf</a>, <a href="https://arxiv.org/format/2501.07897">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Bridge-SR: Schrödinger Bridge for Efficient SR </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Li%2C+C">Chang Li</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+Z">Zehua Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Bao%2C+F">Fan Bao</a>, <a href="/search/eess?searchtype=author&amp;query=Zhu%2C+J">Jun Zhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.07897v1-abstract-short" style="display: inline;"> Speech super-resolution (SR), which generates a waveform at a higher sampling rate from its low-resolution version, is a long-standing critical task in speech restoration. Previous works have explored speech SR in different data spaces, but these methods either require additional compression networks or exhibit limited synthesis quality and inference speed. 
Motivated by recent advances in probabil&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.07897v1-abstract-full').style.display = 'inline'; document.getElementById('2501.07897v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.07897v1-abstract-full" style="display: none;"> Speech super-resolution (SR), which generates a waveform at a higher sampling rate from its low-resolution version, is a long-standing critical task in speech restoration. Previous works have explored speech SR in different data spaces, but these methods either require additional compression networks or exhibit limited synthesis quality and inference speed. Motivated by recent advances in probabilistic generative models, we present Bridge-SR, a novel and efficient any-to-48kHz SR system in the speech waveform domain. Using tractable Schrödinger Bridge models, we leverage the observed low-resolution waveform as a prior, which is intrinsically informative for the high-resolution target. By optimizing a lightweight network to learn the score functions from the prior to the target, we achieve efficient waveform SR through a data-to-data generation process that fully exploits the instructive content contained in the low-resolution observation. Furthermore, we identify the importance of the noise schedule, data scaling, and auxiliary loss functions, which further improve the SR quality of bridge-based systems. The experiments conducted on the benchmark dataset VCTK demonstrate the efficiency of our system: (1) in terms of sample quality, Bridge-SR outperforms several strong baseline methods under different SR settings, using a lightweight network backbone (1.7M); (2) in terms of inference speed, our 4-step synthesis achieves better performance than the 8-step conditional diffusion counterpart (LSD: 0.911 vs 0.927). Demo at https://bridge-sr.github.io. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.07897v1-abstract-full').style.display = 'none'; document.getElementById('2501.07897v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. 
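<p>For reference, the log-spectral distance (LSD) figure quoted above is typically computed along the lines sketched below; the STFT settings used here (1024-point FFT, 50% overlap) are common defaults and not necessarily the exact Bridge-SR evaluation protocol.</p>
<pre><code>
# Minimal sketch of the log-spectral distance (LSD) metric named in the abstract.
# STFT parameters are common defaults, not necessarily those used by Bridge-SR.
import numpy as np
from scipy.signal import stft

def lsd(reference, estimate, sr=48000, n_fft=1024):
    _, _, R = stft(reference, fs=sr, nperseg=n_fft, noverlap=n_fft // 2)
    _, _, E = stft(estimate, fs=sr, nperseg=n_fft, noverlap=n_fft // 2)
    ref_log = np.log10(np.abs(R) ** 2 + 1e-10)
    est_log = np.log10(np.abs(E) ** 2 + 1e-10)
    # RMS over frequency bins, then mean over frames
    return np.mean(np.sqrt(np.mean((ref_log - est_log) ** 2, axis=0)))

t = np.linspace(0, 1, 48000, endpoint=False)
clean = np.sin(2 * np.pi * 440 * t)
degraded = clean + 0.01 * np.random.randn(t.size)
print("LSD:", lsd(clean, degraded))
</code></pre>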
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at ICASSP 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.06514">arXiv:2501.06514</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.06514">pdf</a>, <a href="https://arxiv.org/format/2501.06514">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Neural Codec Source Tracing: Toward Comprehensive Attribution in Open-Set Condition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Xie%2C+Y">Yuankun Xie</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+X">Xiaopeng Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+Z">Zhiyong Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Fu%2C+R">Ruibo Fu</a>, <a href="/search/eess?searchtype=author&amp;query=Wen%2C+Z">Zhengqi Wen</a>, <a href="/search/eess?searchtype=author&amp;query=Cao%2C+S">Songjun Cao</a>, <a href="/search/eess?searchtype=author&amp;query=Ma%2C+L">Long Ma</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+C">Chenxing Li</a>, <a href="/search/eess?searchtype=author&amp;query=Cheng%2C+H">Haonnan Cheng</a>, <a href="/search/eess?searchtype=author&amp;query=Ye%2C+L">Long Ye</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.06514v1-abstract-short" style="display: inline;"> Current research in audio deepfake detection is gradually transitioning from binary classification to multi-class tasks, referred as audio deepfake source tracing task. However, existing studies on source tracing consider only closed-set scenarios and have not considered the challenges posed by open-set conditions. In this paper, we define the Neural Codec Source Tracing (NCST) task, which is capa&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.06514v1-abstract-full').style.display = 'inline'; document.getElementById('2501.06514v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.06514v1-abstract-full" style="display: none;"> Current research in audio deepfake detection is gradually transitioning from binary classification to multi-class tasks, referred as audio deepfake source tracing task. However, existing studies on source tracing consider only closed-set scenarios and have not considered the challenges posed by open-set conditions. In this paper, we define the Neural Codec Source Tracing (NCST) task, which is capable of performing open-set neural codec classification and interpretable ALM detection. Specifically, we constructed the ST-Codecfake dataset for the NCST task, which includes bilingual audio samples generated by 11 state-of-the-art neural codec methods and ALM-based out-ofdistribution (OOD) test samples. 
Furthermore, we establish a comprehensive source tracing benchmark to assess NCST models in open-set conditions. The experimental results reveal that although the NCST models perform well in in-distribution (ID) classification and OOD detection, they lack robustness in classifying unseen real audio. The ST-codecfake dataset and code are available. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.06514v1-abstract-full').style.display = 'none'; document.getElementById('2501.06514v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.03605">arXiv:2501.03605</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.03605">pdf</a>, <a href="https://arxiv.org/format/2501.03605">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> ConcealGS: Concealing Invisible Copyright Information in 3D Gaussian Splatting </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Yang%2C+Y">Yifeng Yang</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+H">Hengyu Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+C">Chenxin Li</a>, <a href="/search/eess?searchtype=author&amp;query=Sun%2C+Y">Yining Sun</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+W">Wuyang Li</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+Y">Yifan Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Lin%2C+Y">Yiyang Lin</a>, <a href="/search/eess?searchtype=author&amp;query=Yuan%2C+Y">Yixuan Yuan</a>, <a href="/search/eess?searchtype=author&amp;query=Ye%2C+N">Nanyang Ye</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.03605v1-abstract-short" style="display: inline;"> With the rapid development of 3D reconstruction technology, the widespread distribution of 3D data has become a future trend. While traditional visual data (such as images and videos) and NeRF-based formats already have mature techniques for copyright protection, steganographic techniques for the emerging 3D Gaussian Splatting (3D-GS) format have yet to be fully explored. To address this, we propo&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.03605v1-abstract-full').style.display = 'inline'; document.getElementById('2501.03605v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.03605v1-abstract-full" style="display: none;"> With the rapid development of 3D reconstruction technology, the widespread distribution of 3D data has become a future trend. 
While traditional visual data (such as images and videos) and NeRF-based formats already have mature techniques for copyright protection, steganographic techniques for the emerging 3D Gaussian Splatting (3D-GS) format have yet to be fully explored. To address this, we propose ConcealGS, an innovative method for embedding implicit information into 3D-GS. By introducing the knowledge distillation and gradient optimization strategy based on 3D-GS, ConcealGS overcomes the limitations of NeRF-based models and enhances the robustness of implicit information and the quality of 3D reconstruction. We evaluate ConcealGS in various potential application scenarios, and experimental results have demonstrated that ConcealGS not only successfully recovers implicit information but also has almost no impact on rendering quality, providing a new approach for embedding invisible and recoverable information into 3D models in the future. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.03605v1-abstract-full').style.display = 'none'; document.getElementById('2501.03605v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.03458">arXiv:2501.03458</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.03458">pdf</a>, <a href="https://arxiv.org/format/2501.03458">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Activating Associative Disease-Aware Vision Token Memory for LLM-Based X-ray Report Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Wang%2C+X">Xiao Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+F">Fuling Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+H">Haowen Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Jiang%2C+B">Bo Jiang</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+C">Chuanfu Li</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+Y">Yaowei Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Tian%2C+Y">Yonghong Tian</a>, <a href="/search/eess?searchtype=author&amp;query=Tang%2C+J">Jin Tang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.03458v1-abstract-short" style="display: inline;"> X-ray image based medical report generation achieves significant progress in recent years with the help of the large language model, however, these models have not fully exploited the effective information in visual image regions, resulting in reports that are linguistically sound but insufficient in describing key diseases. 
In this paper, we propose a novel associative memory-enhanced X-ray repor&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.03458v1-abstract-full').style.display = 'inline'; document.getElementById('2501.03458v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.03458v1-abstract-full" style="display: none;"> X-ray image based medical report generation achieves significant progress in recent years with the help of the large language model, however, these models have not fully exploited the effective information in visual image regions, resulting in reports that are linguistically sound but insufficient in describing key diseases. In this paper, we propose a novel associative memory-enhanced X-ray report generation model that effectively mimics the process of professional doctors writing medical reports. It considers both the mining of global and local visual information and associates historical report information to better complete the writing of the current report. Specifically, given an X-ray image, we first utilize a classification model along with its activation maps to accomplish the mining of visual regions highly associated with diseases and the learning of disease query tokens. Then, we employ a visual Hopfield network to establish memory associations for disease-related tokens, and a report Hopfield network to retrieve report memory information. This process facilitates the generation of high-quality reports based on a large language model and achieves state-of-the-art performance on multiple benchmark datasets, including the IU X-ray, MIMIC-CXR, and Chexpert Plus. The source code of this work is released on \url{https://github.com/Event-AHU/Medical_Image_Analysis}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.03458v1-abstract-full').style.display = 'none'; document.getElementById('2501.03458v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. 
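<p>The Hopfield-network retrieval mentioned in the abstract above is, in its modern continuous form, a softmax-weighted lookup over stored patterns. The sketch below shows that retrieval step in isolation; it is a generic illustration of the mechanism, not the authors' model, and the dimensions and inverse temperature are arbitrary choices.</p>
<pre><code>
# Minimal sketch of modern (continuous) Hopfield retrieval: a query is compared
# against stored memory patterns and a softmax-weighted combination is returned.
# Generic illustration of the mechanism named in the abstract, not the paper's model.
import numpy as np

def hopfield_retrieve(query, memories, beta=4.0):
    # query: (d,), memories: (n, d); larger beta gives sharper, more one-hot retrieval
    scores = beta * memories @ query
    weights = np.exp(scores - scores.max())
    weights = weights / weights.sum()
    return weights @ memories

rng = np.random.default_rng(0)
memories = rng.normal(size=(16, 32))              # e.g. stored disease/report tokens
query = memories[3] + 0.1 * rng.normal(size=32)   # noisy cue for one stored pattern
retrieved = hopfield_retrieve(query, memories)
print("closest memory:", int(np.argmax(memories @ retrieved)))  # ideally index 3
</code></pre>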
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">In Peer Review</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.03416">arXiv:2501.03416</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.03416">pdf</a>, <a href="https://arxiv.org/format/2501.03416">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> TinySense: A Lighter Weight and More Power-efficient Avionics System for Flying Insect-scale Robots </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Yu%2C+Z">Zhitao Yu</a>, <a href="/search/eess?searchtype=author&amp;query=Tran%2C+J">Joshua Tran</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+C">Claire Li</a>, <a href="/search/eess?searchtype=author&amp;query=Weber%2C+A">Aaron Weber</a>, <a href="/search/eess?searchtype=author&amp;query=Talwekar%2C+Y+P">Yash P. Talwekar</a>, <a href="/search/eess?searchtype=author&amp;query=Fuller%2C+S">Sawyer Fuller</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.03416v1-abstract-short" style="display: inline;"> In this paper, we investigate the prospects and challenges of sensor suites in achieving autonomous control for flying insect robots (FIRs) weighing less than a gram. FIRs, owing to their minuscule weight and size, offer unparalleled advantages in terms of material cost and scalability. However, their size introduces considerable control challenges, notably high-speed dynamics, restricted power, a&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.03416v1-abstract-full').style.display = 'inline'; document.getElementById('2501.03416v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.03416v1-abstract-full" style="display: none;"> In this paper, we investigate the prospects and challenges of sensor suites in achieving autonomous control for flying insect robots (FIRs) weighing less than a gram. FIRs, owing to their minuscule weight and size, offer unparalleled advantages in terms of material cost and scalability. However, their size introduces considerable control challenges, notably high-speed dynamics, restricted power, and limited payload capacity. While there have been notable advancements in developing lightweight sensors, often drawing inspiration from biological systems, no sub-gram aircraft has been able to attain sustained hover without relying on feedback from external sensing such as a motion capture system. The lightest vehicle capable of sustained hover -- the first level of &#34;sensor autonomy&#34; -- is the much larger 28 g Crazyflie. Previous work reported a reduction in size of that vehicle&#39;s avionics suite to 187 mg and 21 mW. Here, we report a further reduction in mass and power to only 78.4 mg and 15 mW. 
We replaced the laser rangefinder with a lighter and more efficient pressure sensor, and built a smaller optic flow sensor around a global-shutter imaging chip. A Kalman Filter (KF) fuses these measurements to estimate the state variables that are needed to control hover: pitch angle, translational velocity, and altitude. Our system achieved performance comparable to that of the Crazyflie&#39;s estimator while in flight, with root mean squared errors of 1.573 degrees, 0.186 m/s, and 0.139 m, respectively, relative to motion capture. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.03416v1-abstract-full').style.display = 'none'; document.getElementById('2501.03416v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Submitted to a robotics conference</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.14890">arXiv:2412.14890</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.14890">pdf</a>, <a href="https://arxiv.org/format/2412.14890">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Scale This, Not That: Investigating Key Dataset Attributes for Efficient Speech Enhancement Scaling </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+L">Leying Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+W">Wangyou Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+C">Chenda Li</a>, <a href="/search/eess?searchtype=author&amp;query=Qian%2C+Y">Yanmin Qian</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.14890v1-abstract-short" style="display: inline;"> Recent speech enhancement models have shown impressive performance gains by scaling up model complexity and training data. However, the impact of dataset variability (e.g. text, language, speaker, and noise) has been underexplored. Analyzing each attribute individually is often challenging, as multiple attributes are usually entangled in commonly used datasets, posing a significant obstacle in und&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.14890v1-abstract-full').style.display = 'inline'; document.getElementById('2412.14890v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.14890v1-abstract-full" style="display: none;"> Recent speech enhancement models have shown impressive performance gains by scaling up model complexity and training data. However, the impact of dataset variability (e.g. text, language, speaker, and noise) has been underexplored. 
Analyzing each attribute individually is often challenging, as multiple attributes are usually entangled in commonly used datasets, posing a significant obstacle in understanding the distinct contributions of each attribute to the model&#39;s performance. To address this challenge, we propose a generation-training-evaluation framework that leverages zero-shot text-to-speech systems to investigate the impact of controlled attribute variations on speech enhancement performance. It enables us to synthesize training datasets in a scalable manner while carefully altering each attribute. Based on the proposed framework, we analyze the scaling effects of various dataset attributes on the performance of both discriminative and generative SE models. Extensive experiments on multi-domain corpora imply that acoustic attributes (e.g., speaker and noise) are much more important to current speech enhancement models than semantic attributes (e.g., language and text), offering new insights for future research. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.14890v1-abstract-full').style.display = 'none'; document.getElementById('2412.14890v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.14492">arXiv:2412.14492</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.14492">pdf</a>, <a href="https://arxiv.org/format/2412.14492">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> FaultExplainer: Leveraging Large Language Models for Interpretable Fault Detection and Diagnosis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Khan%2C+A">Abdullah Khan</a>, <a href="/search/eess?searchtype=author&amp;query=Nahar%2C+R">Rahul Nahar</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+H">Hao Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Flores%2C+G+E+C">Gonzalo E. Constante Flores</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+C">Can Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.14492v1-abstract-short" style="display: inline;"> Machine learning algorithms are increasingly being applied to fault detection and diagnosis (FDD) in chemical processes. However, existing data-driven FDD platforms often lack interpretability for process operators and struggle to identify root causes of previously unseen faults. 
This paper presents FaultExplainer, an interactive tool designed to improve fault detection, diagnosis, and explanation&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.14492v1-abstract-full').style.display = 'inline'; document.getElementById('2412.14492v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.14492v1-abstract-full" style="display: none;"> Machine learning algorithms are increasingly being applied to fault detection and diagnosis (FDD) in chemical processes. However, existing data-driven FDD platforms often lack interpretability for process operators and struggle to identify root causes of previously unseen faults. This paper presents FaultExplainer, an interactive tool designed to improve fault detection, diagnosis, and explanation in the Tennessee Eastman Process (TEP). FaultExplainer integrates real-time sensor data visualization, Principal Component Analysis (PCA)-based fault detection, and identification of top contributing variables within an interactive user interface powered by large language models (LLMs). We evaluate the LLMs&#39; reasoning capabilities in two scenarios: one where historical root causes are provided, and one where they are not to mimic the challenge of previously unseen faults. Experimental results using GPT-4o and o1-preview models demonstrate the system&#39;s strengths in generating plausible and actionable explanations, while also highlighting its limitations, including reliance on PCA-selected features and occasional hallucinations. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.14492v1-abstract-full').style.display = 'none'; document.getElementById('2412.14492v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. 
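<p>As a rough illustration of the PCA-based detection step described in the abstract above (not the FaultExplainer implementation; the function names, the squared-prediction-error statistic, and the 99th-percentile control limit are assumptions), the sketch below fits PCA on fault-free data, flags samples whose reconstruction error exceeds a threshold, and ranks the top contributing variables.</p>
<pre><code class="language-python">
import numpy as np
from sklearn.decomposition import PCA

def fit_monitor(X_normal, n_components=5):
    """Fit a PCA monitor on fault-free operating data (rows = samples)."""
    mu, sigma = X_normal.mean(axis=0), X_normal.std(axis=0) + 1e-12
    Z = (X_normal - mu) / sigma
    pca = PCA(n_components=n_components).fit(Z)
    spe = ((Z - pca.inverse_transform(pca.transform(Z))) ** 2).sum(axis=1)
    limit = np.percentile(spe, 99)               # simple empirical control limit
    return pca, mu, sigma, limit

def check_sample(x, pca, mu, sigma, limit, top_k=3):
    """Return (is_fault, indices of the top contributing variables) for one sample."""
    z = (x - mu) / sigma
    residual = z - pca.inverse_transform(pca.transform(z.reshape(1, -1))).ravel()
    contributions = residual ** 2                # per-variable share of the squared prediction error
    return contributions.sum() > limit, np.argsort(contributions)[::-1][:top_k]
</code></pre>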
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.11039">arXiv:2412.11039</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.11039">pdf</a>, <a href="https://arxiv.org/format/2412.11039">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> A Digitalized Atlas for Pulmonary Airway </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+M">Minghui Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+C">Chenyu Li</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+H">Hanxiao Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+Y">Yaoyu Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Gu%2C+Y">Yun Gu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.11039v1-abstract-short" style="display: inline;"> In this work, we proposed AirwayAtlas, which is an end-to-end pipeline for automatic extraction of airway anatomies with lobar, segmental and subsegmental labeling. A compact representation, AirwaySign, is generated based on diverse features of airway branches. Experiments on multi-center datasets validated the effectiveness of AirwayAtlas. We also demonstrated that AirwaySign is a powerful tool f&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.11039v1-abstract-full').style.display = 'inline'; document.getElementById('2412.11039v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.11039v1-abstract-full" style="display: none;"> In this work, we proposed AirwayAtlas, which is an end-to-end pipeline for automatic extraction of airway anatomies with lobar, segmental and subsegmental labeling. A compact representation, AirwaySign, is generated based on diverse features of airway branches. Experiments on multi-center datasets validated the effectiveness of AirwayAtlas. We also demonstrated that AirwaySign is a powerful tool for correlation analysis on pulmonary diseases. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.11039v1-abstract-full').style.display = 'none'; document.getElementById('2412.11039v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Under Review</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.10899">arXiv:2412.10899</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.10899">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Interharmonic Power: A New Concept for Power System Oscillation Source Location </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Xu%2C+W">Wilsun Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Yong%2C+J">Jing Yong</a>, <a href="/search/eess?searchtype=author&amp;query=Marquez%2C+H+J">Horacio J. Marquez</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+C">Chun Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.10899v1-abstract-short" style="display: inline;"> Power system oscillations are a significant concern for system operators, a problem that has grown due to the interconnection of inverter-based resources. To address this issue, various methods have been proposed to locate the sources of oscillations, which is essential for effective mitigation actions. A common characteristic of these methods is that they rely on phasor representation of oscillat&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.10899v1-abstract-full').style.display = 'inline'; document.getElementById('2412.10899v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.10899v1-abstract-full" style="display: none;"> Power system oscillations are a significant concern for system operators, a problem that has grown due to the interconnection of inverter-based resources. To address this issue, various methods have been proposed to locate the sources of oscillations, which is essential for effective mitigation actions. A common characteristic of these methods is that they rely on phasor representation of oscillation phenomena. This paper takes a different approach by examining the actual voltage and current waveforms underlying the phasors. It is found that the presence of interharmonic components is both the necessary and sufficient condition for phasor oscillations. Moreover, the generation and propagation of interharmonic powers are identified as the true culprits behind power system oscillations and oscillatory instability. Based on these insights, two new methods are developed for locating oscillation sources: one for measurement-based monitoring applications and another for model-based system studies. These findings are validated through four field data-based and one simulation-based case studies. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.10899v1-abstract-full').style.display = 'none'; document.getElementById('2412.10899v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">13 pages and 27 figures. An earlier version was submitted to IEEE Trans. on Power System on Aug. 27, 2024 as TPWRS-01433-2024 (Review results unknown as of today). The current version is an improved version for record</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.08577">arXiv:2412.08577</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.08577">pdf</a>, <a href="https://arxiv.org/format/2412.08577">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Mel-Refine: A Plug-and-Play Approach to Refine Mel-Spectrogram in Audio Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Guo%2C+H">Hongming Guo</a>, <a href="/search/eess?searchtype=author&amp;query=Fu%2C+R">Ruibo Fu</a>, <a href="/search/eess?searchtype=author&amp;query=Geng%2C+Y">Yizhong Geng</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+S">Shuai Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Shi%2C+S">Shuchen Shi</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+T">Tao Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Qiang%2C+C">Chunyu Qiang</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+C">Chenxing Li</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+Y">Ya Li</a>, <a href="/search/eess?searchtype=author&amp;query=Wen%2C+Z">Zhengqi Wen</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+Y">Yukun Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+X">Xuefei Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.08577v1-abstract-short" style="display: inline;"> Text-to-audio (TTA) model is capable of generating diverse audio from textual prompts. However, most mainstream TTA models, which predominantly rely on Mel-spectrograms, still face challenges in producing audio with rich content. The intricate details and texture required in Mel-spectrograms for such audio often surpass the models&#39; capacity, leading to outputs that are blurred or lack coherence. 
I&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.08577v1-abstract-full').style.display = 'inline'; document.getElementById('2412.08577v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.08577v1-abstract-full" style="display: none;"> Text-to-audio (TTA) model is capable of generating diverse audio from textual prompts. However, most mainstream TTA models, which predominantly rely on Mel-spectrograms, still face challenges in producing audio with rich content. The intricate details and texture required in Mel-spectrograms for such audio often surpass the models&#39; capacity, leading to outputs that are blurred or lack coherence. In this paper, we begin by investigating the critical role of U-Net in Mel-spectrogram generation. Our analysis shows that in U-Net structure, high-frequency components in skip-connections and the backbone influence texture and detail, while low-frequency components in the backbone are critical for the diffusion denoising process. We further propose ``Mel-Refine&#39;&#39;, a plug-and-play approach that enhances Mel-spectrogram texture and detail by adjusting different component weights during inference. Our method requires no additional training or fine-tuning and is fully compatible with any diffusion-based TTA architecture. Experimental results show that our approach boosts performance metrics of the latest TTA model Tango2 by 25\%, demonstrating its effectiveness. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.08577v1-abstract-full').style.display = 'none'; document.getElementById('2412.08577v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. 
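<p>Mel-Refine is described as reweighting high- and low-frequency components of U-Net features at inference time. Purely as an illustration of that kind of frequency-band reweighting, and not the paper's actual component split, the sketch below scales the high-frequency band of a 2D feature map in the Fourier domain; the cutoff and gains are made-up parameters.</p>
<pre><code class="language-python">
import numpy as np

def reweight_frequencies(feature_map, low_gain=1.0, high_gain=1.3, cutoff=0.25):
    """Scale the low- and high-spatial-frequency bands of a 2D feature map separately."""
    h, w = feature_map.shape
    spec = np.fft.fft2(feature_map)
    fy = np.fft.fftfreq(h)[:, None]              # normalized frequencies in [-0.5, 0.5)
    fx = np.fft.fftfreq(w)[None, :]
    low_band = np.sqrt(fx ** 2 + fy ** 2) <= cutoff
    gains = np.where(low_band, low_gain, high_gain)
    return np.real(np.fft.ifft2(spec * gains))

refined = reweight_frequencies(np.random.rand(80, 64))   # toy "feature map"
</code></pre>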
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.06296">arXiv:2412.06296</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.06296">pdf</a>, <a href="https://arxiv.org/format/2412.06296">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> VidMusician: Video-to-Music Generation with Semantic-Rhythmic Alignment via Hierarchical Visual Features </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Li%2C+S">Sifei Li</a>, <a href="/search/eess?searchtype=author&amp;query=Yang%2C+B">Binxin Yang</a>, <a href="/search/eess?searchtype=author&amp;query=Yin%2C+C">Chunji Yin</a>, <a href="/search/eess?searchtype=author&amp;query=Sun%2C+C">Chong Sun</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+Y">Yuxin Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Dong%2C+W">Weiming Dong</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+C">Chen Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.06296v1-abstract-short" style="display: inline;"> Video-to-music generation presents significant potential in video production, requiring the generated music to be both semantically and rhythmically aligned with the video. Achieving this alignment demands advanced music generation capabilities, sophisticated video understanding, and an efficient mechanism to learn the correspondence between the two modalities. In this paper, we propose VidMusicia&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.06296v1-abstract-full').style.display = 'inline'; document.getElementById('2412.06296v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.06296v1-abstract-full" style="display: none;"> Video-to-music generation presents significant potential in video production, requiring the generated music to be both semantically and rhythmically aligned with the video. Achieving this alignment demands advanced music generation capabilities, sophisticated video understanding, and an efficient mechanism to learn the correspondence between the two modalities. In this paper, we propose VidMusician, a parameter-efficient video-to-music generation framework built upon text-to-music models. VidMusician leverages hierarchical visual features to ensure semantic and rhythmic alignment between video and music. Specifically, our approach utilizes global visual features as semantic conditions and local visual features as rhythmic cues. These features are integrated into the generative backbone via cross-attention and in-attention mechanisms, respectively. Through a two-stage training process, we incrementally incorporate semantic and rhythmic features, utilizing zero initialization and identity initialization to maintain the inherent music-generative capabilities of the backbone. Additionally, we construct a diverse video-music dataset, DVMSet, encompassing various scenarios, such as promo videos, commercials, and compilations. 
Experiments demonstrate that VidMusician outperforms state-of-the-art methods across multiple evaluation metrics and exhibits robust performance on AI-generated videos. Samples are available at \url{https://youtu.be/EPOSXwtl1jw}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.06296v1-abstract-full').style.display = 'none'; document.getElementById('2412.06296v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.06011">arXiv:2412.06011</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.06011">pdf</a>, <a href="https://arxiv.org/format/2412.06011">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> TopoCellGen: Generating Histopathology Cell Topology with a Diffusion Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Xu%2C+M">Meilong Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Gupta%2C+S">Saumya Gupta</a>, <a href="/search/eess?searchtype=author&amp;query=Hu%2C+X">Xiaoling Hu</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+C">Chen Li</a>, <a href="/search/eess?searchtype=author&amp;query=Abousamra%2C+S">Shahira Abousamra</a>, <a href="/search/eess?searchtype=author&amp;query=Samaras%2C+D">Dimitris Samaras</a>, <a href="/search/eess?searchtype=author&amp;query=Prasanna%2C+P">Prateek Prasanna</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+C">Chao Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.06011v1-abstract-short" style="display: inline;"> Accurately modeling multi-class cell topology is crucial in digital pathology, as it provides critical insights into tissue structure and pathology. The synthetic generation of cell topology enables realistic simulations of complex tissue environments, enhances downstream tasks by augmenting training data, aligns more closely with pathologists&#39; domain knowledge, and offers new opportunities for co&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.06011v1-abstract-full').style.display = 'inline'; document.getElementById('2412.06011v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.06011v1-abstract-full" style="display: none;"> Accurately modeling multi-class cell topology is crucial in digital pathology, as it provides critical insights into tissue structure and pathology. The synthetic generation of cell topology enables realistic simulations of complex tissue environments, enhances downstream tasks by augmenting training data, aligns more closely with pathologists&#39; domain knowledge, and offers new opportunities for controlling and generalizing the tumor microenvironment. 
In this paper, we propose a novel approach that integrates topological constraints into a diffusion model to improve the generation of realistic, contextually accurate cell topologies. Our method refines the simulation of cell distributions and interactions, increasing the precision and interpretability of results in downstream tasks such as cell detection and classification. To assess the topological fidelity of generated layouts, we introduce a new metric, Topological Frechet Distance (TopoFD), which overcomes the limitations of traditional metrics like FID in evaluating topological structure. Experimental results demonstrate the effectiveness of our approach in generating multi-class cell layouts that capture intricate topological relationships. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.06011v1-abstract-full').style.display = 'none'; document.getElementById('2412.06011v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">14 pages, 7 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.05290">arXiv:2412.05290</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.05290">pdf</a>, <a href="https://arxiv.org/format/2412.05290">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Memristor-Based Selective Convolutional Circuit for High-Density Salt-and-Pepper Noise Removal </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Ding%2C+B">Binghui Ding</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+L">Ling Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+C">Chuandong Li</a>, <a href="/search/eess?searchtype=author&amp;query=Huang%2C+T">Tingwen Huang</a>, <a href="/search/eess?searchtype=author&amp;query=Mitra%2C+S">Sushmita Mitra</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.05290v1-abstract-short" style="display: inline;"> In this article, we propose a memristor-based selective convolutional (MSC) circuit for salt-and-pepper (SAP) noise removal. We implement its algorithm using memristors in analog circuits. In experiments, we build the MSC model and benchmark it against a ternary selective convolutional (TSC) model. 
Results show that the MSC model effectively restores images corrupted by SAP noise, achieving simila&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.05290v1-abstract-full').style.display = 'inline'; document.getElementById('2412.05290v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.05290v1-abstract-full" style="display: none;"> In this article, we propose a memristor-based selective convolutional (MSC) circuit for salt-and-pepper (SAP) noise removal. We implement its algorithm using memristors in analog circuits. In experiments, we build the MSC model and benchmark it against a ternary selective convolutional (TSC) model. Results show that the MSC model effectively restores images corrupted by SAP noise, achieving similar performance to the TSC model in both quantitative measures and visual quality at noise densities of up to 50%. Note that at high noise densities, the performance of the MSC model even surpasses the theoretical benchmark of its corresponding TSC model. In addition, we propose an enhanced MSC (MSCE) model based on MSC, which reduces power consumption by 57.6% compared with the MSC model while improving performance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.05290v1-abstract-full').style.display = 'none'; document.getElementById('2412.05290v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.00058">arXiv:2412.00058</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.00058">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Real-time volumetric free-hand ultrasound imaging for large-sized organs: A study of imaging the whole spine </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Li%2C+C">Caozhe Li</a>, <a href="/search/eess?searchtype=author&amp;query=Shen%2C+E">Enxiang Shen</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+H">Haoyang Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+Y">Yuxin Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Yuan%2C+J">Jie Yuan</a>, <a href="/search/eess?searchtype=author&amp;query=Gong%2C+L">Li Gong</a>, <a href="/search/eess?searchtype=author&amp;query=Zhao%2C+D">Di Zhao</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+W">Weijing Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Jin%2C+Z">Zhibin Jin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.00058v1-abstract-short" style="display: inline;"> Three-dimensional (3D) ultrasound imaging can overcome the limitations of conventional two dimensional (2D) ultrasound imaging in structural 
observation and measurement. However, conducting volumetric ultrasound imaging for large-sized organs still faces difficulties including long acquisition time, inevitable patient movement, and 3D feature recognition. In this study, we proposed a real-time vol&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.00058v1-abstract-full').style.display = 'inline'; document.getElementById('2412.00058v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.00058v1-abstract-full" style="display: none;"> Three-dimensional (3D) ultrasound imaging can overcome the limitations of conventional two dimensional (2D) ultrasound imaging in structural observation and measurement. However, conducting volumetric ultrasound imaging for large-sized organs still faces difficulties including long acquisition time, inevitable patient movement, and 3D feature recognition. In this study, we proposed a real-time volumetric free-hand ultrasound imaging system optimized for the above issues and applied it to the clinical diagnosis of scoliosis. This study employed an incremental imaging method coupled with algorithmic acceleration to enable real-time processing and visualization of the large amounts of data generated when scanning large-sized organs. Furthermore, to deal with the difficulty of image feature recognition, we proposed two tissue segmentation algorithms to reconstruct and visualize the spinal anatomy in 3D space by approximating the depth at which the bone structures are located and segmenting the ultrasound images at different depths. We validated the adaptability of our system by deploying it to multiple models of ultra-sound equipment and conducting experiments using different types of ultrasound probes. We also conducted experiments on 6 scoliosis patients and 10 normal volunteers to evaluate the performance of our proposed method. Ultrasound imaging of a volunteer spine from shoulder to crotch (more than 500 mm) was performed in 2 minutes, and the 3D imaging results displayed in real-time were compared with the corresponding X-ray images with a correlation coefficient of 0.96 in spinal curvature. Our proposed volumetric ultrasound imaging system might hold the potential to be clinically applied to other large-sized organs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.00058v1-abstract-full').style.display = 'none'; document.getElementById('2412.00058v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. 
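<p>The incremental imaging idea mentioned above can be illustrated by folding each tracked 2D frame into a running-mean voxel grid as it arrives. This is a simplified sketch under assumed conditions (a known 4x4 probe-to-volume pose per frame, one isotropic voxel spacing, nearest-voxel accumulation), not the paper's accelerated pipeline.</p>
<pre><code class="language-python">
import numpy as np

def add_frame(sum_vol, count_vol, frame, pose, spacing_mm):
    """Fold one tracked 2D ultrasound frame into running sum/count voxel grids."""
    h, w = frame.shape
    ys, xs = np.mgrid[0:h, 0:w]
    # homogeneous pixel positions (mm) in the image plane, z = 0
    pts = np.stack([xs * spacing_mm, ys * spacing_mm,
                    np.zeros((h, w)), np.ones((h, w))], axis=-1)
    world = pts.reshape(-1, 4) @ pose.T                  # 4x4 probe-to-volume pose
    idx = np.round(world[:, :3] / spacing_mm).astype(int)
    inside = np.all((idx >= 0) & (idx < np.array(sum_vol.shape)), axis=1)
    flat = np.ravel_multi_index(idx[inside].T, sum_vol.shape)
    np.add.at(sum_vol.reshape(-1), flat, frame.reshape(-1)[inside])
    np.add.at(count_vol.reshape(-1), flat, 1.0)
    # the current reconstruction is sum_vol / np.maximum(count_vol, 1)
</code></pre>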
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.19000">arXiv:2411.19000</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.19000">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> A Unified Platform for At-Home Post-Stroke Rehabilitation Enabled by Wearable Technologies and Artificial Intelligence </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Tang%2C+C">Chenyu Tang</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+R">Ruizhi Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Gao%2C+S">Shuo Gao</a>, <a href="/search/eess?searchtype=author&amp;query=Zhao%2C+Z">Zihe Zhao</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+Z">Zibo Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+J">Jiaqi Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+C">Cong Li</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+J">Junliang Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Dai%2C+Y">Yanning Dai</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+S">Shengbo Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Juan%2C+R">Ruoyu Juan</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+Q">Qiaoying Li</a>, <a href="/search/eess?searchtype=author&amp;query=Xie%2C+R">Ruimou Xie</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+X">Xuhang Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Zhou%2C+X">Xinkai Zhou</a>, <a href="/search/eess?searchtype=author&amp;query=Xia%2C+Y">Yunjia Xia</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+J">Jianan Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Lu%2C+F">Fanghao Lu</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+X">Xin Li</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+N">Ninglli Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Smielewski%2C+P">Peter Smielewski</a>, <a href="/search/eess?searchtype=author&amp;query=Pan%2C+Y">Yu Pan</a>, <a href="/search/eess?searchtype=author&amp;query=Zhao%2C+H">Hubin Zhao</a>, <a href="/search/eess?searchtype=author&amp;query=Occhipinti%2C+L+G">Luigi G. Occhipinti</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.19000v1-abstract-short" style="display: inline;"> At-home rehabilitation for post-stroke patients presents significant challenges, as continuous, personalized care is often limited outside clinical settings. Additionally, the absence of comprehensive solutions addressing diverse rehabilitation needs in home environments complicates recovery efforts. 
Here, we introduce a smart home platform that integrates wearable sensors, ambient monitoring, and&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.19000v1-abstract-full').style.display = 'inline'; document.getElementById('2411.19000v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.19000v1-abstract-full" style="display: none;"> At-home rehabilitation for post-stroke patients presents significant challenges, as continuous, personalized care is often limited outside clinical settings. Additionally, the absence of comprehensive solutions addressing diverse rehabilitation needs in home environments complicates recovery efforts. Here, we introduce a smart home platform that integrates wearable sensors, ambient monitoring, and large language model (LLM)-powered assistance to provide seamless health monitoring and intelligent support. The system leverages machine learning enabled plantar pressure arrays for motor recovery assessment (94% classification accuracy), a wearable eye-tracking module for cognitive evaluation, and ambient sensors for precise smart home control (100% operational success, &lt;1 s latency). Additionally, the LLM-powered agent, Auto-Care, offers real-time interventions, such as health reminders and environmental adjustments, enhancing user satisfaction by 29%. This work establishes a fully integrated platform for long-term, personalized rehabilitation, offering new possibilities for managing chronic conditions and supporting aging populations. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.19000v1-abstract-full').style.display = 'none'; document.getElementById('2411.19000v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 figures, 35 references</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.18266">arXiv:2411.18266</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.18266">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Wearable intelligent throat enables natural speech in stroke patients with dysarthria </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Tang%2C+C">Chenyu Tang</a>, <a href="/search/eess?searchtype=author&amp;query=Gao%2C+S">Shuo Gao</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+C">Cong Li</a>, <a href="/search/eess?searchtype=author&amp;query=Yi%2C+W">Wentian Yi</a>, <a href="/search/eess?searchtype=author&amp;query=Jin%2C+Y">Yuxuan Jin</a>, <a href="/search/eess?searchtype=author&amp;query=Zhai%2C+X">Xiaoxue Zhai</a>, <a href="/search/eess?searchtype=author&amp;query=Lei%2C+S">Sixuan Lei</a>, <a href="/search/eess?searchtype=author&amp;query=Meng%2C+H">Hongbei Meng</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+Z">Zibo Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Xu%2C+M">Muzi Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+S">Shengbo Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+X">Xuhang Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+C">Chenxi Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Yang%2C+H">Hongyun Yang</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+N">Ningli Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+W">Wenyu Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Cao%2C+J">Jin Cao</a>, <a href="/search/eess?searchtype=author&amp;query=Feng%2C+X">Xiaodong Feng</a>, <a href="/search/eess?searchtype=author&amp;query=Smielewski%2C+P">Peter Smielewski</a>, <a href="/search/eess?searchtype=author&amp;query=Pan%2C+Y">Yu Pan</a>, <a href="/search/eess?searchtype=author&amp;query=Song%2C+W">Wenhui Song</a>, <a href="/search/eess?searchtype=author&amp;query=Birchall%2C+M">Martin Birchall</a>, <a href="/search/eess?searchtype=author&amp;query=Occhipinti%2C+L+G">Luigi G. Occhipinti</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.18266v2-abstract-short" style="display: inline;"> Wearable silent speech systems hold significant potential for restoring communication in patients with speech impairments. However, seamless, coherent speech remains elusive, and clinical efficacy is still unproven. 
Here, we present an AI-driven intelligent throat (IT) system that integrates throat muscle vibrations and carotid pulse signal sensors with large language model (LLM) processing to ena&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.18266v2-abstract-full').style.display = 'inline'; document.getElementById('2411.18266v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.18266v2-abstract-full" style="display: none;"> Wearable silent speech systems hold significant potential for restoring communication in patients with speech impairments. However, seamless, coherent speech remains elusive, and clinical efficacy is still unproven. Here, we present an AI-driven intelligent throat (IT) system that integrates throat muscle vibrations and carotid pulse signal sensors with large language model (LLM) processing to enable fluent, emotionally expressive communication. The system utilizes ultrasensitive textile strain sensors to capture high-quality signals from the neck area and supports token-level processing for real-time, continuous speech decoding, enabling seamless, delay-free communication. In tests with five stroke patients with dysarthria, IT&#39;s LLM agents intelligently corrected token errors and enriched sentence-level emotional and logical coherence, achieving low error rates (4.2% word error rate, 2.9% sentence error rate) and a 55% increase in user satisfaction. This work establishes a portable, intuitive communication platform for patients with dysarthria with the potential to be applied broadly across different neurological conditions and in multi-language support systems. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.18266v2-abstract-full').style.display = 'none'; document.getElementById('2411.18266v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 27 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
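<p>The 4.2% word error rate quoted above follows the standard edit-distance definition; for reference, a small generic sketch of that metric (not the authors' evaluation code):</p>
<pre><code class="language-python">
def word_error_rate(reference, hypothesis):
    """Levenshtein distance between word sequences, divided by the reference length."""
    ref, hyp = reference.split(), hypothesis.split()
    d = [[i + j if i * j == 0 else 0 for j in range(len(hyp) + 1)]
         for i in range(len(ref) + 1)]
    for i in range(1, len(ref) + 1):
        for j in range(1, len(hyp) + 1):
            sub = d[i - 1][j - 1] + (ref[i - 1] != hyp[j - 1])
            d[i][j] = min(sub, d[i - 1][j] + 1, d[i][j - 1] + 1)
    return d[len(ref)][len(hyp)] / max(len(ref), 1)

print(word_error_rate("the patient feels fine today", "the patient feel fine"))   # 0.4
</code></pre>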
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 figures, 45 references</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.17051">arXiv:2411.17051</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.17051">pdf</a>, <a href="https://arxiv.org/format/2411.17051">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/TSG.2023.3321376">10.1109/TSG.2023.3321376 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Computation-power Coupled Modeling for IDCs and Collaborative Optimization in ADNs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Li%2C+C">Chuyi Li</a>, <a href="/search/eess?searchtype=author&amp;query=Zheng%2C+K">Kedi Zheng</a>, <a href="/search/eess?searchtype=author&amp;query=Guo%2C+H">Hongye Guo</a>, <a href="/search/eess?searchtype=author&amp;query=Kang%2C+C">Chongqing Kang</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+Q">Qixin Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.17051v1-abstract-short" style="display: inline;"> The batch and online workload of Internet data centers (IDCs) offer temporal and spatial scheduling flexibility. Given that power generation costs vary over time and location, harnessing the flexibility of IDCs&#39; energy consumption through workload regulation can optimize the power flow within the system. This paper focuses on multi-geographically distributed IDCs managed by an Internet service com&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.17051v1-abstract-full').style.display = 'inline'; document.getElementById('2411.17051v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.17051v1-abstract-full" style="display: none;"> The batch and online workload of Internet data centers (IDCs) offer temporal and spatial scheduling flexibility. Given that power generation costs vary over time and location, harnessing the flexibility of IDCs&#39; energy consumption through workload regulation can optimize the power flow within the system. This paper focuses on multi-geographically distributed IDCs managed by an Internet service company (ISC), which are aggregated as a controllable load. The load flexibility resulting from spatial load regulation of online workload is taken into account. A two-step workload scheduling mechanism is adopted, and a computation-power coupling model of ISC is established to facilitate collaborative optimization in active distribution networks (ADNs). To address the model-solving problem based on the assumption of scheduling homogeneity, a model reconstruction method is proposed. An efficient iterative algorithm is designed to solve the reconstructed model. 
Furthermore, the Nash bargaining solution is employed to coordinate the different optimization objectives of ISC and power system operators, thereby avoiding subjective arbitrariness. Experimental cases based on a 33-node distribution system are designed to verify the effectiveness of the model and algorithm in optimizing ISC&#39;s energy consumption and power flow within the system. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.17051v1-abstract-full').style.display = 'none'; document.getElementById('2411.17051v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Paper accepted for IEEE Transactions on Smart Grid. Personal use of this material is permitted. Permission from IEEE must be obtained for all other uses</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> IEEE Transactions on Smart Grid, VOL. 15, NO. 3, MAY 2024 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.15576">arXiv:2411.15576</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.15576">pdf</a>, <a href="https://arxiv.org/format/2411.15576">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> MulModSeg: Enhancing Unpaired Multi-Modal Medical Image Segmentation with Modality-Conditioned Text Embedding and Alternating Training </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Li%2C+C">Chengyin Li</a>, <a href="/search/eess?searchtype=author&amp;query=Zhu%2C+H">Hui Zhu</a>, <a href="/search/eess?searchtype=author&amp;query=Sultan%2C+R+I">Rafi Ibn Sultan</a>, <a href="/search/eess?searchtype=author&amp;query=Ebadian%2C+H+B">Hassan Bagher Ebadian</a>, <a href="/search/eess?searchtype=author&amp;query=Khanduri%2C+P">Prashant Khanduri</a>, <a href="/search/eess?searchtype=author&amp;query=Indrin%2C+C">Chetty Indrin</a>, <a href="/search/eess?searchtype=author&amp;query=Thind%2C+K">Kundan Thind</a>, <a href="/search/eess?searchtype=author&amp;query=Zhu%2C+D">Dongxiao Zhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.15576v1-abstract-short" style="display: inline;"> In the diverse field of medical imaging, automatic segmentation has numerous applications and must handle a wide variety of input domains, such as different types of Computed Tomography (CT) scans and Magnetic Resonance (MR) images. 
This heterogeneity challenges automatic segmentation algorithms to maintain consistent performance across different modalities due to the requirement for spatially ali&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.15576v1-abstract-full').style.display = 'inline'; document.getElementById('2411.15576v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.15576v1-abstract-full" style="display: none;"> In the diverse field of medical imaging, automatic segmentation has numerous applications and must handle a wide variety of input domains, such as different types of Computed Tomography (CT) scans and Magnetic Resonance (MR) images. This heterogeneity challenges automatic segmentation algorithms to maintain consistent performance across different modalities due to the requirement for spatially aligned and paired images. Typically, segmentation models are trained using a single modality, which limits their ability to generalize to other types of input data without employing transfer learning techniques. Additionally, leveraging complementary information from different modalities to enhance segmentation precision often necessitates substantial modifications to popular encoder-decoder designs, such as introducing multiple branched encoding or decoding paths for each modality. In this work, we propose a simple Multi-Modal Segmentation (MulModSeg) strategy to enhance medical image segmentation across multiple modalities, specifically CT and MR. It incorporates two key designs: a modality-conditioned text embedding framework via a frozen text encoder that adds modality awareness to existing segmentation frameworks without significant structural modifications or computational overhead, and an alternating training procedure that facilitates the integration of essential features from unpaired CT and MR inputs. Through extensive experiments with both Fully Convolutional Network and Transformer-based backbones, MulModSeg consistently outperforms previous methods in segmenting abdominal multi-organ and cardiac substructures for both CT and MR modalities. The code is available in this {\href{https://github.com/ChengyinLee/MulModSeg_2024}{link}}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.15576v1-abstract-full').style.display = 'none'; document.getElementById('2411.15576v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
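<p>The alternating-training idea described above can be illustrated with a toy loop that interleaves unpaired CT and MR batches and conditions a small network on a per-modality embedding standing in for the frozen-text-encoder features. Everything below, including the tiny model and the random tensors, is an assumption for illustration rather than the MulModSeg code.</p>
<pre><code class="language-python">
import torch
import torch.nn as nn
import torch.nn.functional as F

class TinySeg(nn.Module):
    """Toy conv segmenter whose features are shifted by a learned modality embedding."""
    def __init__(self, n_classes=3, n_modalities=2, width=16):
        super().__init__()
        self.enc = nn.Conv2d(1, width, 3, padding=1)
        self.mod_embed = nn.Embedding(n_modalities, width)   # stand-in for frozen text features
        self.head = nn.Conv2d(width, n_classes, 1)

    def forward(self, x, modality_id):
        feat = F.relu(self.enc(x))
        feat = feat + self.mod_embed(modality_id)[:, :, None, None]
        return self.head(feat)

model = TinySeg()
opt = torch.optim.Adam(model.parameters(), lr=1e-3)
ct_batches = [(torch.randn(2, 1, 32, 32), torch.randint(0, 3, (2, 32, 32))) for _ in range(4)]
mr_batches = [(torch.randn(2, 1, 32, 32), torch.randint(0, 3, (2, 32, 32))) for _ in range(4)]

# Alternate one unpaired CT step and one unpaired MR step per iteration
for (ct_x, ct_y), (mr_x, mr_y) in zip(ct_batches, mr_batches):
    for x, y, mod in [(ct_x, ct_y, 0), (mr_x, mr_y, 1)]:
        modality = torch.full((x.shape[0],), mod, dtype=torch.long)
        loss = F.cross_entropy(model(x, modality), y)
        opt.zero_grad()
        loss.backward()
        opt.step()
</code></pre>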
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by WACV-2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11116">arXiv:2411.11116</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.11116">pdf</a>, <a href="https://arxiv.org/ps/2411.11116">ps</a>, <a href="https://arxiv.org/format/2411.11116">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> DBF-Net: A Dual-Branch Network with Feature Fusion for Ultrasound Image Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Xu%2C+G">Guoping Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Wu%2C+X">Ximing Wu</a>, <a href="/search/eess?searchtype=author&amp;query=Liao%2C+W">Wentao Liao</a>, <a href="/search/eess?searchtype=author&amp;query=Wu%2C+X">Xinglong Wu</a>, <a href="/search/eess?searchtype=author&amp;query=Huang%2C+Q">Qing Huang</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+C">Chang Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11116v1-abstract-short" style="display: inline;"> Accurately segmenting lesions in ultrasound images is challenging due to the difficulty in distinguishing boundaries between lesions and surrounding tissues. While deep learning has improved segmentation accuracy, there is limited focus on boundary quality and its relationship with body structures. To address this, we introduce UBBS-Net, a dual-branch deep neural network that learns the relationsh&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11116v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11116v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11116v1-abstract-full" style="display: none;"> Accurately segmenting lesions in ultrasound images is challenging due to the difficulty in distinguishing boundaries between lesions and surrounding tissues. While deep learning has improved segmentation accuracy, there is limited focus on boundary quality and its relationship with body structures. To address this, we introduce UBBS-Net, a dual-branch deep neural network that learns the relationship between body and boundary for improved segmentation. We also propose a feature fusion module to integrate body and boundary information. Evaluated on three public datasets, UBBS-Net outperforms existing methods, achieving Dice Similarity Coefficients of 81.05% for breast cancer, 76.41% for brachial plexus nerves, and 87.75% for infantile hemangioma segmentation. Our results demonstrate the effectiveness of UBBS-Net for ultrasound image segmentation. The code is available at https://github.com/apple1986/DBF-Net. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11116v1-abstract-full').style.display = 'none'; document.getElementById('2411.11116v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.08305">arXiv:2411.08305</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.08305">pdf</a>, <a href="https://arxiv.org/format/2411.08305">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Robust Divergence Learning for Missing-Modality Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Cheng%2C+R">Runze Cheng</a>, <a href="/search/eess?searchtype=author&amp;query=Sun%2C+Z">Zhongao Sun</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+Y">Ye Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+C">Chun Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.08305v1-abstract-short" style="display: inline;"> Multimodal Magnetic Resonance Imaging (MRI) provides essential complementary information for analyzing brain tumor subregions. While methods using four common MRI modalities for automatic segmentation have shown success, they often face challenges with missing modalities due to image quality issues, inconsistent protocols, allergic reactions, or cost factors. Thus, developing a segmentation paradi&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.08305v1-abstract-full').style.display = 'inline'; document.getElementById('2411.08305v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.08305v1-abstract-full" style="display: none;"> Multimodal Magnetic Resonance Imaging (MRI) provides essential complementary information for analyzing brain tumor subregions. While methods using four common MRI modalities for automatic segmentation have shown success, they often face challenges with missing modalities due to image quality issues, inconsistent protocols, allergic reactions, or cost factors. Thus, developing a segmentation paradigm that handles missing modalities is clinically valuable. A novel single-modality parallel processing network framework based on H枚lder divergence and mutual information is introduced. Each modality is independently input into a shared network backbone for parallel processing, preserving unique information. Additionally, a dynamic sharing framework is introduced that adjusts network parameters based on modality availability. A H枚lder divergence and mutual information-based loss functions are used for evaluating discrepancies between predictions and labels. 
Extensive testing on the BraTS 2018 and BraTS 2020 datasets demonstrates that our method outperforms existing techniques in handling missing modalities and validates each component&#39;s effectiveness. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.08305v1-abstract-full').style.display = 'none'; document.getElementById('2411.08305v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.07111">arXiv:2411.07111</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.07111">pdf</a>, <a href="https://arxiv.org/format/2411.07111">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Building a Taiwanese Mandarin Spoken Language Model: A First Attempt </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Yang%2C+C">Chih-Kai Yang</a>, <a href="/search/eess?searchtype=author&amp;query=Fu%2C+Y">Yu-Kuan Fu</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+C">Chen-An Li</a>, <a href="/search/eess?searchtype=author&amp;query=Lin%2C+Y">Yi-Cheng Lin</a>, <a href="/search/eess?searchtype=author&amp;query=Lin%2C+Y">Yu-Xiang Lin</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+W">Wei-Chih Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Chung%2C+H+L">Ho Lam Chung</a>, <a href="/search/eess?searchtype=author&amp;query=Kuan%2C+C">Chun-Yi Kuan</a>, <a href="/search/eess?searchtype=author&amp;query=Huang%2C+W">Wei-Ping Huang</a>, <a href="/search/eess?searchtype=author&amp;query=Lu%2C+K">Ke-Han Lu</a>, <a href="/search/eess?searchtype=author&amp;query=Lin%2C+T">Tzu-Quan Lin</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+H">Hsiu-Hsuan Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Hu%2C+E">En-Pei Hu</a>, <a href="/search/eess?searchtype=author&amp;query=Hsu%2C+C">Chan-Jan Hsu</a>, <a href="/search/eess?searchtype=author&amp;query=Tseng%2C+L">Liang-Hsuan Tseng</a>, <a href="/search/eess?searchtype=author&amp;query=Chiu%2C+I">I-Hsiang Chiu</a>, <a href="/search/eess?searchtype=author&amp;query=Sanga%2C+U">Ulin Sanga</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+X">Xuanjun Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Hsu%2C+P">Po-chun Hsu</a>, <a href="/search/eess?searchtype=author&amp;query=Yang%2C+S">Shu-wen Yang</a>, <a href="/search/eess?searchtype=author&amp;query=Lee%2C+H">Hung-yi Lee</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.07111v2-abstract-short" style="display: inline;"> This technical report presents our initial attempt to build a spoken large language model (LLM) for Taiwanese Mandarin, specifically tailored to enable real-time, 
speech-to-speech interaction in multi-turn conversations. Our end-to-end model incorporates a decoder-only transformer architecture and aims to achieve seamless interaction while preserving the conversational flow, including full-duplex&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07111v2-abstract-full').style.display = 'inline'; document.getElementById('2411.07111v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.07111v2-abstract-full" style="display: none;"> This technical report presents our initial attempt to build a spoken large language model (LLM) for Taiwanese Mandarin, specifically tailored to enable real-time, speech-to-speech interaction in multi-turn conversations. Our end-to-end model incorporates a decoder-only transformer architecture and aims to achieve seamless interaction while preserving the conversational flow, including full-duplex capabilities allowing simultaneous speaking and listening. The paper also details the training process, including data preparation with synthesized dialogues and adjustments for real-time interaction. We also developed a platform to evaluate conversational fluency and response coherence in multi-turn dialogues. We hope the release of the report can contribute to the future development of spoken LLMs in Taiwanese Mandarin. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07111v2-abstract-full').style.display = 'none'; document.getElementById('2411.07111v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 11 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Work in progress</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.05361">arXiv:2411.05361</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.05361">pdf</a>, <a href="https://arxiv.org/format/2411.05361">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Dynamic-SUPERB Phase-2: A Collaboratively Expanding Benchmark for Measuring the Capabilities of Spoken Language Models with 180 Tasks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Huang%2C+C">Chien-yu Huang</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+W">Wei-Chih Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Yang%2C+S">Shu-wen Yang</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+A+T">Andy T. 
Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+C">Chen-An Li</a>, <a href="/search/eess?searchtype=author&amp;query=Lin%2C+Y">Yu-Xiang Lin</a>, <a href="/search/eess?searchtype=author&amp;query=Tseng%2C+W">Wei-Cheng Tseng</a>, <a href="/search/eess?searchtype=author&amp;query=Diwan%2C+A">Anuj Diwan</a>, <a href="/search/eess?searchtype=author&amp;query=Shih%2C+Y">Yi-Jen Shih</a>, <a href="/search/eess?searchtype=author&amp;query=Shi%2C+J">Jiatong Shi</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+W">William Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+X">Xuanjun Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Hsiao%2C+C">Chi-Yuan Hsiao</a>, <a href="/search/eess?searchtype=author&amp;query=Peng%2C+P">Puyuan Peng</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+S">Shih-Heng Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Kuan%2C+C">Chun-Yi Kuan</a>, <a href="/search/eess?searchtype=author&amp;query=Lu%2C+K">Ke-Han Lu</a>, <a href="/search/eess?searchtype=author&amp;query=Chang%2C+K">Kai-Wei Chang</a>, <a href="/search/eess?searchtype=author&amp;query=Yang%2C+C">Chih-Kai Yang</a>, <a href="/search/eess?searchtype=author&amp;query=Ritter-Gutierrez%2C+F">Fabian Ritter-Gutierrez</a>, <a href="/search/eess?searchtype=author&amp;query=Chuang%2C+M+T">Ming To Chuang</a>, <a href="/search/eess?searchtype=author&amp;query=Huang%2C+K">Kuan-Po Huang</a>, <a href="/search/eess?searchtype=author&amp;query=Arora%2C+S">Siddhant Arora</a>, <a href="/search/eess?searchtype=author&amp;query=Lin%2C+Y">You-Kuan Lin</a>, <a href="/search/eess?searchtype=author&amp;query=Yeo%2C+E">Eunjung Yeo</a> , et al. (53 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.05361v1-abstract-short" style="display: inline;"> Multimodal foundation models, such as Gemini and ChatGPT, have revolutionized human-machine interactions by seamlessly integrating various forms of data. Developing a universal spoken language model that comprehends a wide range of natural language instructions is critical for bridging communication gaps and facilitating more intuitive interactions. However, the absence of a comprehensive evaluati&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05361v1-abstract-full').style.display = 'inline'; document.getElementById('2411.05361v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.05361v1-abstract-full" style="display: none;"> Multimodal foundation models, such as Gemini and ChatGPT, have revolutionized human-machine interactions by seamlessly integrating various forms of data. Developing a universal spoken language model that comprehends a wide range of natural language instructions is critical for bridging communication gaps and facilitating more intuitive interactions. However, the absence of a comprehensive evaluation benchmark poses a significant challenge. We present Dynamic-SUPERB Phase-2, an open and evolving benchmark for the comprehensive evaluation of instruction-based universal speech models. 
Building upon the first generation, this second version incorporates 125 new tasks contributed collaboratively by the global research community, expanding the benchmark to a total of 180 tasks, making it the largest benchmark for speech and audio evaluation. While the first generation of Dynamic-SUPERB was limited to classification tasks, Dynamic-SUPERB Phase-2 broadens its evaluation capabilities by introducing a wide array of novel and diverse tasks, including regression and sequence generation, across speech, music, and environmental audio. Evaluation results indicate that none of the models performed well universally. SALMONN-13B excelled in English ASR, while WavLLM demonstrated high accuracy in emotion recognition, but current models still require further innovations to handle a broader range of tasks. We will soon open-source all task data and the evaluation pipeline. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05361v1-abstract-full').style.display = 'none'; document.getElementById('2411.05361v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.23628">arXiv:2410.23628</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.23628">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Medical Physics">physics.med-ph</span> </div> </div> <p class="title is-5 mathjax"> Cycle-Constrained Adversarial Denoising Convolutional Network for PET Image Denoising: Multi-Dimensional Validation on Large Datasets with Reader Study and Real Low-Dose Data </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Hou%2C+Y">Yucun Hou</a>, <a href="/search/eess?searchtype=author&amp;query=Zhan%2C+F">Fenglin Zhan</a>, <a href="/search/eess?searchtype=author&amp;query=Cheng%2C+X">Xin Cheng</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+C">Chenxi Li</a>, <a href="/search/eess?searchtype=author&amp;query=Yuan%2C+Z">Ziquan Yuan</a>, <a href="/search/eess?searchtype=author&amp;query=Liao%2C+R">Runze Liao</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+H">Haihao Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Hua%2C+J">Jianlang Hua</a>, <a href="/search/eess?searchtype=author&amp;query=Wu%2C+J">Jing Wu</a>, <a href="/search/eess?searchtype=author&amp;query=Jiang%2C+J">Jianyong Jiang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.23628v1-abstract-short" style="display: inline;"> Positron emission tomography (PET) is a critical tool for diagnosing tumors and neurological disorders but poses radiation risks to patients, particularly to sensitive populations. While reducing injected radiation dose mitigates this risk, it often compromises image quality. 
To reconstruct full-dose-quality images from low-dose scans, we propose a Cycle-constrained Adversarial Denoising Convoluti&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23628v1-abstract-full').style.display = 'inline'; document.getElementById('2410.23628v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.23628v1-abstract-full" style="display: none;"> Positron emission tomography (PET) is a critical tool for diagnosing tumors and neurological disorders but poses radiation risks to patients, particularly to sensitive populations. While reducing injected radiation dose mitigates this risk, it often compromises image quality. To reconstruct full-dose-quality images from low-dose scans, we propose a Cycle-constrained Adversarial Denoising Convolutional Network (Cycle-DCN). This model integrates a noise predictor, two discriminators, and a consistency network, and is optimized using a combination of supervised loss, adversarial loss, cycle consistency loss, identity loss, and neighboring Structural Similarity Index (SSIM) loss. Experiments were conducted on a large dataset consisting of raw PET brain data from 1,224 patients, acquired using a Siemens Biograph Vision PET/CT scanner. Each patient underwent a 120-second brain scan. To simulate low-dose PET conditions, images were reconstructed from shortened scan durations of 30, 12, and 5 seconds, corresponding to 1/4, 1/10, and 1/24 of the full-dose acquisition, respectively, using custom-developed GPU-based image reconstruction software. The results show that Cycle-DCN significantly improves average Peak Signal-to-Noise Ratio (PSNR), SSIM, and Normalized Root Mean Square Error (NRMSE) across three dose levels, with improvements of up to 56%, 35%, and 71%, respectively. Additionally, it achieves contrast-to-noise ratio (CNR) and Edge Preservation Index (EPI) values that closely align with full-dose images, effectively preserving image details, tumor shape, and contrast, while resolving issues with blurred edges. The results of reader studies indicated that the images restored by Cycle-DCN consistently received the highest ratings from nuclear medicine physicians, highlighting their strong clinical relevance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23628v1-abstract-full').style.display = 'none'; document.getElementById('2410.23628v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This work has been submitted to the IEEE for possible publication</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.21276">arXiv:2410.21276</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.21276">pdf</a>, <a href="https://arxiv.org/format/2410.21276">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> GPT-4o System Card </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=OpenAI"> OpenAI</a>, <a href="/search/eess?searchtype=author&amp;query=%3A"> :</a>, <a href="/search/eess?searchtype=author&amp;query=Hurst%2C+A">Aaron Hurst</a>, <a href="/search/eess?searchtype=author&amp;query=Lerer%2C+A">Adam Lerer</a>, <a href="/search/eess?searchtype=author&amp;query=Goucher%2C+A+P">Adam P. Goucher</a>, <a href="/search/eess?searchtype=author&amp;query=Perelman%2C+A">Adam Perelman</a>, <a href="/search/eess?searchtype=author&amp;query=Ramesh%2C+A">Aditya Ramesh</a>, <a href="/search/eess?searchtype=author&amp;query=Clark%2C+A">Aidan Clark</a>, <a href="/search/eess?searchtype=author&amp;query=Ostrow%2C+A">AJ Ostrow</a>, <a href="/search/eess?searchtype=author&amp;query=Welihinda%2C+A">Akila Welihinda</a>, <a href="/search/eess?searchtype=author&amp;query=Hayes%2C+A">Alan Hayes</a>, <a href="/search/eess?searchtype=author&amp;query=Radford%2C+A">Alec Radford</a>, <a href="/search/eess?searchtype=author&amp;query=M%C4%85dry%2C+A">Aleksander M膮dry</a>, <a href="/search/eess?searchtype=author&amp;query=Baker-Whitcomb%2C+A">Alex Baker-Whitcomb</a>, <a href="/search/eess?searchtype=author&amp;query=Beutel%2C+A">Alex Beutel</a>, <a href="/search/eess?searchtype=author&amp;query=Borzunov%2C+A">Alex Borzunov</a>, <a href="/search/eess?searchtype=author&amp;query=Carney%2C+A">Alex Carney</a>, <a href="/search/eess?searchtype=author&amp;query=Chow%2C+A">Alex Chow</a>, <a href="/search/eess?searchtype=author&amp;query=Kirillov%2C+A">Alex Kirillov</a>, <a href="/search/eess?searchtype=author&amp;query=Nichol%2C+A">Alex Nichol</a>, <a href="/search/eess?searchtype=author&amp;query=Paino%2C+A">Alex Paino</a>, <a href="/search/eess?searchtype=author&amp;query=Renzin%2C+A">Alex Renzin</a>, <a href="/search/eess?searchtype=author&amp;query=Passos%2C+A+T">Alex Tachard Passos</a>, <a href="/search/eess?searchtype=author&amp;query=Kirillov%2C+A">Alexander Kirillov</a>, <a href="/search/eess?searchtype=author&amp;query=Christakis%2C+A">Alexi Christakis</a> , et al. 
(395 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.21276v1-abstract-short" style="display: inline;"> GPT-4o is an autoregressive omni model that accepts as input any combination of text, audio, image, and video, and generates any combination of text, audio, and image outputs. It&#39;s trained end-to-end across text, vision, and audio, meaning all inputs and outputs are processed by the same neural network. GPT-4o can respond to audio inputs in as little as 232 milliseconds, with an average of 320 mil&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21276v1-abstract-full').style.display = 'inline'; document.getElementById('2410.21276v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.21276v1-abstract-full" style="display: none;"> GPT-4o is an autoregressive omni model that accepts as input any combination of text, audio, image, and video, and generates any combination of text, audio, and image outputs. It&#39;s trained end-to-end across text, vision, and audio, meaning all inputs and outputs are processed by the same neural network. GPT-4o can respond to audio inputs in as little as 232 milliseconds, with an average of 320 milliseconds, which is similar to human response time in conversation. It matches GPT-4 Turbo performance on text in English and code, with significant improvement on text in non-English languages, while also being much faster and 50\% cheaper in the API. GPT-4o is especially better at vision and audio understanding compared to existing models. In line with our commitment to building AI safely and consistent with our voluntary commitments to the White House, we are sharing the GPT-4o System Card, which includes our Preparedness Framework evaluations. In this System Card, we provide a detailed look at GPT-4o&#39;s capabilities, limitations, and safety evaluations across multiple categories, focusing on speech-to-speech while also evaluating text and image capabilities, and measures we&#39;ve implemented to ensure the model is safe and aligned. We also include third-party assessments on dangerous capabilities, as well as discussion of potential societal impacts of GPT-4o&#39;s text and vision capabilities. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21276v1-abstract-full').style.display = 'none'; document.getElementById('2410.21276v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.20466">arXiv:2410.20466</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.20466">pdf</a>, <a href="https://arxiv.org/format/2410.20466">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Guidance Disentanglement Network for Optics-Guided Thermal UAV Image Super-Resolution </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Zhao%2C+Z">Zhicheng Zhao</a>, <a href="/search/eess?searchtype=author&amp;query=Gu%2C+J">Juanjuan Gu</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+C">Chenglong Li</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+C">Chun Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Huang%2C+Z">Zhongling Huang</a>, <a href="/search/eess?searchtype=author&amp;query=Tang%2C+J">Jin Tang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.20466v1-abstract-short" style="display: inline;"> Optics-guided Thermal UAV image Super-Resolution (OTUAV-SR) has attracted significant research interest due to its potential applications in security inspection, agricultural measurement, and object detection. Existing methods often employ single guidance model to generate the guidance features from optical images to assist thermal UAV images super-resolution. However, single guidance models make&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20466v1-abstract-full').style.display = 'inline'; document.getElementById('2410.20466v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.20466v1-abstract-full" style="display: none;"> Optics-guided Thermal UAV image Super-Resolution (OTUAV-SR) has attracted significant research interest due to its potential applications in security inspection, agricultural measurement, and object detection. Existing methods often employ single guidance model to generate the guidance features from optical images to assist thermal UAV images super-resolution. However, single guidance models make it difficult to generate effective guidance features under favorable and adverse conditions in UAV scenarios, thus limiting the performance of OTUAV-SR. To address this issue, we propose a novel Guidance Disentanglement network (GDNet), which disentangles the optical image representation according to typical UAV scenario attributes to form guidance features under both favorable and adverse conditions, for robust OTUAV-SR. Moreover, we design an attribute-aware fusion module to combine all attribute-based optical guidance features, which could form a more discriminative representation and fit the attribute-agnostic guidance process. To facilitate OTUAV-SR research in complex UAV scenarios, we introduce VGTSR2.0, a large-scale benchmark dataset containing 3,500 aligned optical-thermal image pairs captured under diverse conditions and scenes. 
Extensive experiments on VGTSR2.0 demonstrate that GDNet significantly improves OTUAV-SR performance over state-of-the-art methods, especially in the challenging low-light and foggy environments commonly encountered in UAV scenarios. The dataset and code will be publicly available at https://github.com/Jocelyney/GDNet. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20466v1-abstract-full').style.display = 'none'; document.getElementById('2410.20466v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">18 pages, 19 figures, 8 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.18413">arXiv:2410.18413</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.18413">pdf</a>, <a href="https://arxiv.org/format/2410.18413">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Optimization and Control">math.OC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> AC-Network-Informed DC Optimal Power Flow for Electricity Markets </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Constante-Flores%2C+G+E">Gonzalo E. Constante-Flores</a>, <a href="/search/eess?searchtype=author&amp;query=Quisaguano%2C+A+H">André H. Quisaguano</a>, <a href="/search/eess?searchtype=author&amp;query=Conejo%2C+A+J">Antonio J. Conejo</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+C">Can Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.18413v1-abstract-short" style="display: inline;"> This paper presents a parametric quadratic approximation of the AC optimal power flow (AC-OPF) problem for time-sensitive and market-based applications. The parametric approximation preserves the physics-based but simple representation provided by the DC-OPF model and leverages market and physics information encoded in the data-driven demand-dependent parameters. To enable the deployment of the pr&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18413v1-abstract-full').style.display = 'inline'; document.getElementById('2410.18413v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.18413v1-abstract-full" style="display: none;"> This paper presents a parametric quadratic approximation of the AC optimal power flow (AC-OPF) problem for time-sensitive and market-based applications. The parametric approximation preserves the physics-based but simple representation provided by the DC-OPF model and leverages market and physics information encoded in the data-driven demand-dependent parameters. 
To enable the deployment of the proposed model for real-time applications, we propose a supervised learning approach to predict near-optimal parameters, given a certain metric concerning the dispatch quantities and locational marginal prices (LMPs). The training dataset is generated based on the solution of the accurate AC-OPF problem and a bilevel optimization problem, which calibrates parameters satisfying two market properties: cost recovery and revenue adequacy. We show the proposed approach&#39;s performance in various test systems in terms of cost and dispatch approximation errors, LMPs, market properties satisfaction, dispatch feasibility, and generalizability with respect to N-1 network topologies. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18413v1-abstract-full').style.display = 'none'; document.getElementById('2410.18413v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">11 pages, 6 figures, 52nd Hawaii International Conference on System</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.15946">arXiv:2410.15946</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.15946">pdf</a>, <a href="https://arxiv.org/format/2410.15946">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Neural Predictor for Flight Control with Payload </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Jin%2C+A">Ao Jin</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+C">Chenhao Li</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+Q">Qinyi Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+Y">Ya Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Huang%2C+P">Panfeng Huang</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+F">Fan Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.15946v1-abstract-short" style="display: inline;"> Aerial robots that transport suspended payloads as freely-floating manipulators have attracted growing interest in recent years. However, prior information about the payload, such as its mass, is hard to obtain accurately in practice. 
The force/torque caused by the payload and residual dynamics will introduce unmodeled perturbations to the system, which negatively affects the closed-&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15946v1-abstract-full').style.display = 'inline'; document.getElementById('2410.15946v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.15946v1-abstract-full" style="display: none;"> Aerial robots that transport suspended payloads as freely-floating manipulators have attracted growing interest in recent years. However, prior information about the payload, such as its mass, is hard to obtain accurately in practice. The force/torque caused by the payload and residual dynamics will introduce unmodeled perturbations to the system, which negatively affects the closed-loop performance. Different from estimation-like methods, this paper proposes Neural Predictor, a learning-based approach that models the force/torque caused by the payload and residual dynamics as a dynamical system. This results in a hybrid model including both the first-principles dynamics and the learned dynamics. The hybrid model is then integrated into an MPC framework to improve closed-loop performance. The effectiveness of the proposed framework is verified extensively in both numerical simulations and real-world flight experiments. The results indicate that our approach can capture the force/torque caused by the payload and residual dynamics accurately, respond quickly to changes in them, and improve the closed-loop performance significantly. In particular, Neural Predictor outperforms a state-of-the-art learning-based estimator and reduces the force and torque estimation errors by up to 66.15% and 33.33% while using fewer samples. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15946v1-abstract-full').style.display = 'none'; document.getElementById('2410.15946v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.15175">arXiv:2410.15175</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.15175">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Medical Physics">physics.med-ph</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Implicit neural representation for free-breathing MR fingerprinting (INR-MRF): co-registered 3D whole-liver water T1, water T2, proton density fat fraction, and R2* mapping </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Li%2C+C">Chao Li</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+J">Jiahao Li</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+J">Jinwei Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Solomon%2C+E">Eddy Solomon</a>, <a href="/search/eess?searchtype=author&amp;query=Dimov%2C+A+V">Alexey V. Dimov</a>, <a href="/search/eess?searchtype=author&amp;query=Spincemaille%2C+P">Pascal Spincemaille</a>, <a href="/search/eess?searchtype=author&amp;query=Nguyen%2C+T+D">Thanh D. Nguyen</a>, <a href="/search/eess?searchtype=author&amp;query=Prince%2C+M+R">Martin R. Prince</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+Y">Yi Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.15175v1-abstract-short" style="display: inline;"> Purpose: To develop an MRI technique for free-breathing 3D whole-liver quantification of water T1, water T2, proton density fat fraction (PDFF), R2*. Methods: An Eight-echo spoiled gradient echo pulse sequence with spiral readout was developed by interleaving inversion recovery and T2 magnetization preparation. We propose a neural network based on a 4D and a 3D implicit neural representation (INR)&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15175v1-abstract-full').style.display = 'inline'; document.getElementById('2410.15175v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.15175v1-abstract-full" style="display: none;"> Purpose: To develop an MRI technique for free-breathing 3D whole-liver quantification of water T1, water T2, proton density fat fraction (PDFF), R2*. Methods: An Eight-echo spoiled gradient echo pulse sequence with spiral readout was developed by interleaving inversion recovery and T2 magnetization preparation. We propose a neural network based on a 4D and a 3D implicit neural representation (INR) which simultaneously learns the motion deformation fields and the static reference frame MRI subspace images respectively. Water and fat singular images were separated during network training, with no need of performing retrospective water-fat separation. 
T1, T2, R2* and proton density fat fraction (PDFF) produced by the proposed method were validated in vivo on 10 healthy subjects, using quantitative maps generated from conventional scans as reference. Results: Our results showed minimal bias and narrow 95% limits of agreement on T1, T2, R2* and PDFF values in the liver compared to conventional breath-holding scans. Conclusions: INR-MRF enabled co-registered 3D whole liver T1, T2, R2* and PDFF mapping in a single free-breathing scan. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15175v1-abstract-full').style.display = 'none'; document.getElementById('2410.15175v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.13992">arXiv:2410.13992</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.13992">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Resilience-Oriented DG Siting and Sizing Considering Energy Equity Constraint </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Li%2C+C">Chenchen Li</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+F">Fangxing Li</a>, <a href="/search/eess?searchtype=author&amp;query=Jiang%2C+S">Sufan Jiang</a>, <a href="/search/eess?searchtype=author&amp;query=Zhao%2C+J">Jin Zhao</a>, <a href="/search/eess?searchtype=author&amp;query=Fan%2C+S">Shiyuan Fan</a>, <a href="/search/eess?searchtype=author&amp;query=Tolbert%2C+L+M">Leon M. Tolbert</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.13992v1-abstract-short" style="display: inline;"> Extreme weather events can cause widespread power outages and huge economic losses. Low-income customers are more vulnerable to power outages because they live in areas with poorly equipped distribution systems. However, existing approaches to improve grid resilience focus on the overall condition of the system and ignore the outage experiences of low-income customers, which leads to significant e&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13992v1-abstract-full').style.display = 'inline'; document.getElementById('2410.13992v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.13992v1-abstract-full" style="display: none;"> Extreme weather events can cause widespread power outages and huge economic losses. Low-income customers are more vulnerable to power outages because they live in areas with poorly equipped distribution systems. However, existing approaches to improve grid resilience focus on the overall condition of the system and ignore the outage experiences of low-income customers, which leads to significant energy inequities in resilience. 
Therefore, this paper explores a new resilience-oriented planning method for distributed generator (DG) siting and sizing, by embedding an additional energy equity constraint (EEC). First, the expected load shedding index (ELSI) is defined as the ratio of the load shedding to the original load, which quantifies the resilience-oriented energy equity. Then, the DG siting and sizing problem is formulated as a two-stage stochastic programming with the EEC. The first stage determines the optimal sites and sizes of DG units under investment constraints and EECs, while the second stage optimizes expected costs of unserved load. A subsidiary variable is introduced to ensure the model&#39;s solvability. Finally, numerical studies are performed on the IEEE 33-bus and 123-bus systems to verify the effectiveness of the proposed DG planning model in achieving energy equity. Three observations are presented as future guidelines for resilience-oriented DG planning. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13992v1-abstract-full').style.display = 'none'; document.getElementById('2410.13992v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.11148">arXiv:2410.11148</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.11148">pdf</a>, <a href="https://arxiv.org/format/2410.11148">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Deep unrolled primal dual network for TOF-PET list-mode image reconstruction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Hu%2C+R">Rui Hu</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+C">Chenxu Li</a>, <a href="/search/eess?searchtype=author&amp;query=Tian%2C+K">Kun Tian</a>, <a href="/search/eess?searchtype=author&amp;query=Cui%2C+J">Jianan Cui</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+Y">Yunmei Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+H">Huafeng Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.11148v1-abstract-short" style="display: inline;"> Time-of-flight (TOF) information provides more accurate location data for annihilation photons, thereby enhancing the quality of PET reconstruction images and reducing noise. List-mode reconstruction has a significant advantage in handling TOF information. However, current advanced TOF PET list-mode reconstruction algorithms still require improvements when dealing with low-count data. 
Deep learnin&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.11148v1-abstract-full').style.display = 'inline'; document.getElementById('2410.11148v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.11148v1-abstract-full" style="display: none;"> Time-of-flight (TOF) information provides more accurate location data for annihilation photons, thereby enhancing the quality of PET reconstruction images and reducing noise. List-mode reconstruction has a significant advantage in handling TOF information. However, current advanced TOF PET list-mode reconstruction algorithms still require improvements when dealing with low-count data. Deep learning algorithms have shown promising results in PET image reconstruction. Nevertheless, the incorporation of TOF information poses significant challenges related to the storage space required by deep learning methods, particularly for the advanced deep unrolled methods. In this study, we propose a deep unrolled primal dual network for TOF-PET list-mode reconstruction. The network is unrolled into multiple phases, with each phase comprising a dual network for list-mode domain updates and a primal network for image domain updates. We utilize CUDA for parallel acceleration and computation of the system matrix for TOF list-mode data, and we adopt a dynamic access strategy to mitigate memory consumption. Reconstructed images of different TOF resolutions and different count levels show that the proposed method outperforms the LM-OSEM, LM-EMTV, LM-SPDHG, LM-SPDHG-TV and FastPET methods in both visual and quantitative analysis. These results demonstrate the potential application of deep unrolled methods for TOF-PET list-mode data and show better performance than current mainstream TOF-PET list-mode reconstruction algorithms, providing new insights for the application of deep learning methods in TOF list-mode data. The codes for this work are available at https://github.com/RickHH/LMPDnet <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.11148v1-abstract-full').style.display = 'none'; document.getElementById('2410.11148v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">11 pages, 11 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.06544">arXiv:2410.06544</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.06544">pdf</a>, <a href="https://arxiv.org/format/2410.06544">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> SRC-gAudio: Sampling-Rate-Controlled Audio Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Li%2C+C">Chenxing Li</a>, <a href="/search/eess?searchtype=author&amp;query=Xu%2C+M">Manjie Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Yu%2C+D">Dong Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.06544v1-abstract-short" style="display: inline;"> We introduce SRC-gAudio, a novel audio generation model designed to facilitate text-to-audio generation across a wide range of sampling rates within a single model architecture. SRC-gAudio incorporates the sampling rate as part of the generation condition to guide the diffusion-based audio generation process. Our model enables the generation of audio at multiple sampling rates with a single unifie&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.06544v1-abstract-full').style.display = 'inline'; document.getElementById('2410.06544v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.06544v1-abstract-full" style="display: none;"> We introduce SRC-gAudio, a novel audio generation model designed to facilitate text-to-audio generation across a wide range of sampling rates within a single model architecture. SRC-gAudio incorporates the sampling rate as part of the generation condition to guide the diffusion-based audio generation process. Our model enables the generation of audio at multiple sampling rates with a single unified model. Furthermore, we explore the potential benefits of large-scale, low-sampling-rate data in enhancing the generation quality of high-sampling-rate audio. Through extensive experiments, we demonstrate that SRC-gAudio effectively generates audio under controlled sampling rates. Additionally, our results indicate that pre-training on low-sampling-rate data can lead to significant improvements in audio quality across various metrics. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.06544v1-abstract-full').style.display = 'none'; document.getElementById('2410.06544v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by APSIPA2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.05474">arXiv:2410.05474</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.05474">pdf</a>, <a href="https://arxiv.org/format/2410.05474">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> R-Bench: Are your Large Multimodal Model Robust to Real-world Corruptions? </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Li%2C+C">Chunyi Li</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+J">Jianbo Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+Z">Zicheng Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Wu%2C+H">Haoning Wu</a>, <a href="/search/eess?searchtype=author&amp;query=Tian%2C+Y">Yuan Tian</a>, <a href="/search/eess?searchtype=author&amp;query=Sun%2C+W">Wei Sun</a>, <a href="/search/eess?searchtype=author&amp;query=Lu%2C+G">Guo Lu</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+X">Xiaohong Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Min%2C+X">Xiongkuo Min</a>, <a href="/search/eess?searchtype=author&amp;query=Lin%2C+W">Weisi Lin</a>, <a href="/search/eess?searchtype=author&amp;query=Zhai%2C+G">Guangtao Zhai</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.05474v1-abstract-short" style="display: inline;"> The outstanding performance of Large Multimodal Models (LMMs) has made them widely applied in vision-related tasks. However, various corruptions in the real world mean that images will not be as ideal as in simulations, presenting significant challenges for the practical application of LMMs. To address this issue, we introduce R-Bench, a benchmark focused on the **Real-world Robustness of LMMs**.&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.05474v1-abstract-full').style.display = 'inline'; document.getElementById('2410.05474v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.05474v1-abstract-full" style="display: none;"> The outstanding performance of Large Multimodal Models (LMMs) has made them widely applied in vision-related tasks. However, various corruptions in the real world mean that images will not be as ideal as in simulations, presenting significant challenges for the practical application of LMMs. To address this issue, we introduce R-Bench, a benchmark focused on the **Real-world Robustness of LMMs**. 
Specifically, we: (a) model the complete link from user capture to LMMs reception, comprising 33 corruption dimensions, including 7 steps according to the corruption sequence, and 7 groups based on low-level attributes; (b) collect reference/distorted image dataset before/after corruption, including 2,970 question-answer pairs with human labeling; (c) propose comprehensive evaluation for absolute/relative robustness and benchmark 20 mainstream LMMs. Results show that while LMMs can correctly handle the original reference images, their performance is not stable when faced with distorted images, and there is a significant gap in robustness compared to the human visual system. We hope that R-Bench will inspire improving the robustness of LMMs, **extending them from experimental simulations to the real-world application**. Check https://q-future.github.io/R-Bench for details. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.05474v1-abstract-full').style.display = 'none'; document.getElementById('2410.05474v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.04366">arXiv:2410.04366</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.04366">pdf</a>, <a href="https://arxiv.org/format/2410.04366">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> RespDiff: An End-to-End Multi-scale RNN Diffusion Model for Respiratory Waveform Estimation from PPG Signals </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Miao%2C+Y">Yuyang Miao</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+Z">Zehua Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+C">Chang Li</a>, <a href="/search/eess?searchtype=author&amp;query=Mandic%2C+D">Danilo Mandic</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.04366v1-abstract-short" style="display: inline;"> Respiratory rate (RR) is a critical health indicator often monitored under inconvenient scenarios, limiting its practicality for continuous monitoring. Photoplethysmography (PPG) sensors, increasingly integrated into wearable devices, offer a chance to continuously estimate RR in a portable manner. 

arXiv:2410.04366 [pdf, other] (https://arxiv.org/abs/2410.04366)
Subjects: Signal Processing (eess.SP); Artificial Intelligence (cs.AI); Human-Computer Interaction (cs.HC)
Title: RespDiff: An End-to-End Multi-scale RNN Diffusion Model for Respiratory Waveform Estimation from PPG Signals
Authors: Yuyang Miao, Zehua Chen, Chang Li, Danilo Mandic
Abstract: Respiratory rate (RR) is a critical health indicator that is often monitored under inconvenient scenarios, limiting its practicality for continuous monitoring. Photoplethysmography (PPG) sensors, increasingly integrated into wearable devices, offer a chance to estimate RR continuously in a portable manner. In this paper, we propose RespDiff, an end-to-end multi-scale RNN diffusion model for respiratory waveform estimation from PPG signals. RespDiff does not require hand-crafted features or the exclusion of low-quality signal segments, making it suitable for real-world scenarios. The model employs multi-scale encoders to extract features at different resolutions and a bidirectional RNN to process PPG signals and extract the respiratory waveform. Additionally, a spectral loss term is introduced to further optimize the model. Experiments conducted on the BIDMC dataset demonstrate that RespDiff outperforms notable previous works, achieving a mean absolute error (MAE) of 1.18 bpm for RR estimation, while prior methods range from 1.66 to 2.15 bpm, showing its potential for robust and accurate respiratory monitoring in real-world applications.
Submitted: 6 October, 2024; originally announced October 2024.
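
The abstract mentions a spectral loss term used alongside the main training objective. A common realization of such a term is a distance between STFT magnitudes of the estimated and reference respiratory waveforms; the sketch below illustrates that idea in PyTorch under this assumption. It is not the authors' implementation, and the FFT size, hop length, and L1 distance are placeholder choices.

import torch

def spectral_loss(pred, target, n_fft=256, hop_length=64):
    """L1 distance between STFT magnitudes of predicted and reference
    respiratory waveforms (illustrative; parameters are assumptions)."""
    window = torch.hann_window(n_fft, device=pred.device)
    spec_pred = torch.stft(pred, n_fft, hop_length=hop_length,
                           window=window, return_complex=True).abs()
    spec_ref = torch.stft(target, n_fft, hop_length=hop_length,
                          window=window, return_complex=True).abs()
    return torch.mean(torch.abs(spec_pred - spec_ref))

# Usage idea: add to the main (e.g., diffusion) loss with a weighting factor.
# total_loss = main_loss + lambda_spec * spectral_loss(pred_wave, ref_wave)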
