
Search | arXiv e-print repository

<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1&ndash;50 of 519 results for author: <span class="mathjax">Zhou, Y</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> <div class="content"> <form method="GET" action="/search/eess" aria-role="search"> Searching in archive <strong>eess</strong>. <a href="/search/?searchtype=author&amp;query=Zhou%2C+Y">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Zhou, Y"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Zhou%2C+Y&amp;terms-0-field=author&amp;size=50&amp;order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Zhou, Y"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Zhou%2C+Y&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Zhou%2C+Y&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Zhou%2C+Y&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Zhou%2C+Y&amp;start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Zhou%2C+Y&amp;start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Zhou%2C+Y&amp;start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">&hellip;</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.13988">arXiv:2502.13988</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.13988">pdf</a>, <a href="https://arxiv.org/format/2502.13988">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> A Lightweight Model for Perceptual Image Compression via Implicit Priors </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Wei%2C+H">Hao Wei</a>, <a href="/search/eess?searchtype=author&amp;query=Zhou%2C+Y">Yanhui Zhou</a>, <a href="/search/eess?searchtype=author&amp;query=Jia%2C+Y">Yiwen Jia</a>, <a href="/search/eess?searchtype=author&amp;query=Ge%2C+C">Chenyang Ge</a>, <a href="/search/eess?searchtype=author&amp;query=Anwar%2C+S">Saeed Anwar</a>, <a href="/search/eess?searchtype=author&amp;query=Mian%2C+A">Ajmal Mian</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.13988v1-abstract-short" style="display: inline;"> Perceptual image compression has shown strong potential for producing visually appealing results at low bitrates, surpassing classical standards and pixel-wise distortion-oriented neural methods. 
However, existing methods typically improve compression performance by incorporating explicit semantic priors, such as segmentation maps and textual features, into the encoder or decoder, which increases&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.13988v1-abstract-full').style.display = 'inline'; document.getElementById('2502.13988v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.13988v1-abstract-full" style="display: none;"> Perceptual image compression has shown strong potential for producing visually appealing results at low bitrates, surpassing classical standards and pixel-wise distortion-oriented neural methods. However, existing methods typically improve compression performance by incorporating explicit semantic priors, such as segmentation maps and textual features, into the encoder or decoder, which increases model complexity by adding parameters and floating-point operations. This limits the model&#39;s practicality, as image compression often occurs on resource-limited mobile devices. To alleviate this problem, we propose a lightweight perceptual Image Compression method using Implicit Semantic Priors (ICISP). We first develop an enhanced visual state space block that exploits local and global spatial dependencies to reduce redundancy. Since different frequency information contributes unequally to compression, we develop a frequency decomposition modulation block to adaptively preserve or reduce the low-frequency and high-frequency information. We establish the above blocks as the main modules of the encoder-decoder, and to further improve the perceptual quality of the reconstructed images, we develop a semantic-informed discriminator that uses implicit semantic priors from a pretrained DINOv2 encoder. Experiments on popular benchmarks show that our method achieves competitive compression performance and has significantly fewer network parameters and floating point operations than the existing state-of-the-art. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.13988v1-abstract-full').style.display = 'none'; document.getElementById('2502.13988v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. 
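   The frequency decomposition modulation idea is concrete enough to sketch. Below is a minimal PyTorch illustration of one plausible form of such a block, splitting features into low- and high-frequency bands via average pooling and rescaling each band with a learned per-channel gate; the split strategy, gate design, and all names are illustrative assumptions, not the ICISP implementation.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class FreqDecompModulation(nn.Module):
    """Hypothetical frequency decomposition modulation block: a pooling
    low-pass filter splits features into low/high-frequency bands, and a
    learned per-channel gate rescales each band before recombination."""
    def __init__(self, channels):
        super().__init__()
        self.low_gate = nn.Sequential(nn.Conv2d(channels, channels, 1), nn.Sigmoid())
        self.high_gate = nn.Sequential(nn.Conv2d(channels, channels, 1), nn.Sigmoid())

    def forward(self, x):
        low = F.interpolate(F.avg_pool2d(x, 2), size=x.shape[-2:],
                            mode="bilinear", align_corners=False)
        high = x - low                      # residual holds the high frequencies
        return self.low_gate(low) * low + self.high_gate(high) * high

x = torch.randn(1, 64, 32, 32)
print(FreqDecompModulation(64)(x).shape)    # torch.Size([1, 64, 32, 32])
```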
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.11946">arXiv:2502.11946</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.11946">pdf</a>, <a href="https://arxiv.org/format/2502.11946">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Step-Audio: Unified Understanding and Generation in Intelligent Speech Interaction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Huang%2C+A">Ailin Huang</a>, <a href="/search/eess?searchtype=author&amp;query=Wu%2C+B">Boyong Wu</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+B">Bruce Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Yan%2C+C">Chao Yan</a>, <a href="/search/eess?searchtype=author&amp;query=Hu%2C+C">Chen Hu</a>, <a href="/search/eess?searchtype=author&amp;query=Feng%2C+C">Chengli Feng</a>, <a href="/search/eess?searchtype=author&amp;query=Tian%2C+F">Fei Tian</a>, <a href="/search/eess?searchtype=author&amp;query=Shen%2C+F">Feiyu Shen</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+J">Jingbei Li</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+M">Mingrui Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+P">Peng Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Miao%2C+R">Ruihang Miao</a>, <a href="/search/eess?searchtype=author&amp;query=You%2C+W">Wang You</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+X">Xi Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Yang%2C+X">Xuerui Yang</a>, <a href="/search/eess?searchtype=author&amp;query=Huang%2C+Y">Yechang Huang</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+Y">Yuxiang Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Gong%2C+Z">Zheng Gong</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+Z">Zixin Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Zhou%2C+H">Hongyu Zhou</a>, <a href="/search/eess?searchtype=author&amp;query=Sun%2C+J">Jianjian Sun</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+B">Brian Li</a>, <a href="/search/eess?searchtype=author&amp;query=Feng%2C+C">Chengting Feng</a>, <a href="/search/eess?searchtype=author&amp;query=Wan%2C+C">Changyi Wan</a>, <a href="/search/eess?searchtype=author&amp;query=Hu%2C+H">Hanpeng Hu</a> , et al. (120 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.11946v2-abstract-short" style="display: inline;"> Real-time speech interaction, serving as a fundamental interface for human-machine collaboration, holds immense potential. However, current open-source models face limitations such as high costs in voice data collection, weakness in dynamic control, and limited intelligence. 
To address these challenges, this paper introduces Step-Audio, the first production-ready open-source solution. Key contribu&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11946v2-abstract-full').style.display = 'inline'; document.getElementById('2502.11946v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.11946v2-abstract-full" style="display: none;"> Real-time speech interaction, serving as a fundamental interface for human-machine collaboration, holds immense potential. However, current open-source models face limitations such as high costs in voice data collection, weakness in dynamic control, and limited intelligence. To address these challenges, this paper introduces Step-Audio, the first production-ready open-source solution. Key contributions include: 1) a 130B-parameter unified speech-text multi-modal model that achieves unified understanding and generation, with the Step-Audio-Chat version open-sourced; 2) a generative speech data engine that establishes an affordable voice cloning framework and produces the open-sourced lightweight Step-Audio-TTS-3B model through distillation; 3) an instruction-driven fine control system enabling dynamic adjustments across dialects, emotions, singing, and RAP; 4) an enhanced cognitive architecture augmented with tool calling and role-playing abilities to manage complex tasks effectively. Based on our new StepEval-Audio-360 evaluation benchmark, Step-Audio achieves state-of-the-art performance in human evaluations, especially in terms of instruction following. On open-source benchmarks like LLaMA Question, shows 9.3% average performance improvement, demonstrating our commitment to advancing the development of open-source multi-modal language technologies. Our code and models are available at https://github.com/stepfun-ai/Step-Audio. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11946v2-abstract-full').style.display = 'none'; document.getElementById('2502.11946v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 17 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.09662">arXiv:2502.09662</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.09662">pdf</a>, <a href="https://arxiv.org/format/2502.09662">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Quantitative Methods">q-bio.QM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Generalizable Cervical Cancer Screening via Large-scale Pretraining and Test-Time Adaptation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Jiang%2C+H">Hao Jiang</a>, <a href="/search/eess?searchtype=author&amp;query=Jin%2C+C">Cheng Jin</a>, <a href="/search/eess?searchtype=author&amp;query=Lin%2C+H">Huangjing Lin</a>, <a href="/search/eess?searchtype=author&amp;query=Zhou%2C+Y">Yanning Zhou</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+X">Xi Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Ma%2C+J">Jiabo Ma</a>, <a href="/search/eess?searchtype=author&amp;query=Ding%2C+L">Li Ding</a>, <a href="/search/eess?searchtype=author&amp;query=Hou%2C+J">Jun Hou</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+R">Runsheng Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Chai%2C+Z">Zhizhong Chai</a>, <a href="/search/eess?searchtype=author&amp;query=Luo%2C+L">Luyang Luo</a>, <a href="/search/eess?searchtype=author&amp;query=Shi%2C+H">Huijuan Shi</a>, <a href="/search/eess?searchtype=author&amp;query=Qian%2C+Y">Yinling Qian</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+Q">Qiong Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+C">Changzhong Li</a>, <a href="/search/eess?searchtype=author&amp;query=Han%2C+A">Anjia Han</a>, <a href="/search/eess?searchtype=author&amp;query=Chan%2C+R+C+K">Ronald Cheong Kin Chan</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+H">Hao Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.09662v1-abstract-short" style="display: inline;"> Cervical cancer is a leading malignancy in female reproductive system. While AI-assisted cytology offers a cost-effective and non-invasive screening solution, current systems struggle with generalizability in complex clinical scenarios. To address this issue, we introduced Smart-CCS, a generalizable Cervical Cancer Screening paradigm based on pretraining and adaptation to create robust and general&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.09662v1-abstract-full').style.display = 'inline'; document.getElementById('2502.09662v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.09662v1-abstract-full" style="display: none;"> Cervical cancer is a leading malignancy in female reproductive system. While AI-assisted cytology offers a cost-effective and non-invasive screening solution, current systems struggle with generalizability in complex clinical scenarios. 
To address this issue, we introduced Smart-CCS, a generalizable Cervical Cancer Screening paradigm based on pretraining and adaptation to create robust and generalizable screening systems. To develop and validate Smart-CCS, we first curated a large-scale, multi-center dataset named CCS-127K, which comprises a total of 127,471 cervical cytology whole-slide images collected from 48 medical centers. By leveraging large-scale self-supervised pretraining, our CCS models are equipped with strong generalization capability, potentially generalizing across diverse scenarios. Then, we incorporated test-time adaptation to specifically optimize the trained CCS model for complex clinical settings, which adapts and refines predictions, improving real-world applicability. We conducted large-scale system evaluation among various cohorts. In retrospective cohorts, Smart-CCS achieved an overall area under the curve (AUC) value of 0.965 and sensitivity of 0.913 for cancer screening on 11 internal test datasets. In external testing, system performance maintained high at 0.950 AUC across 6 independent test datasets. In prospective cohorts, our Smart-CCS achieved AUCs of 0.947, 0.924, and 0.986 in three prospective centers, respectively. Moreover, the system demonstrated superior sensitivity in diagnosing cervical cancer, confirming the accuracy of our cancer screening results by using histology findings for validation. Interpretability analysis with cell and slide predictions further indicated that the system&#39;s decision-making aligns with clinical practice. Smart-CCS represents a significant advancement in cancer screening across diverse clinical contexts. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.09662v1-abstract-full').style.display = 'none'; document.getElementById('2502.09662v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.06289">arXiv:2502.06289</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.06289">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Is an Ultra Large Natural Image-Based Foundation Model Superior to a Retina-Specific Model for Detecting Ocular and Systemic Diseases? 
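   The abstract does not detail the adaptation procedure; a common way to realize test-time adaptation is Tent-style entropy minimization, sketched below as a generic stand-in (not Smart-CCS's method): only normalization-layer parameters are updated on unlabeled test batches.

```python
import torch
import torch.nn.functional as F

def test_time_adapt(model, loader, steps=1, lr=1e-4):
    """Entropy-minimization TTA (Tent-style): update only the affine
    parameters of normalization layers on unlabeled test batches."""
    params = [p for m in model.modules()
              if isinstance(m, (torch.nn.BatchNorm2d, torch.nn.LayerNorm))
              for p in m.parameters()]
    opt = torch.optim.Adam(params, lr=lr)
    model.train()  # use current batch statistics in BatchNorm
    for _ in range(steps):
        for x in loader:
            probs = F.softmax(model(x), dim=1)
            entropy = -(probs * probs.clamp_min(1e-8).log()).sum(dim=1).mean()
            opt.zero_grad()
            entropy.backward()
            opt.step()
    return model
```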
</p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Hou%2C+Q">Qingshan Hou</a>, <a href="/search/eess?searchtype=author&amp;query=Zhou%2C+Y">Yukun Zhou</a>, <a href="/search/eess?searchtype=author&amp;query=Goh%2C+J+H+L">Jocelyn Hui Lin Goh</a>, <a href="/search/eess?searchtype=author&amp;query=Zou%2C+K">Ke Zou</a>, <a href="/search/eess?searchtype=author&amp;query=Yew%2C+S+M+E">Samantha Min Er Yew</a>, <a href="/search/eess?searchtype=author&amp;query=Srinivasan%2C+S">Sahana Srinivasan</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+M">Meng Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Lo%2C+T">Thaddaeus Lo</a>, <a href="/search/eess?searchtype=author&amp;query=Lei%2C+X">Xiaofeng Lei</a>, <a href="/search/eess?searchtype=author&amp;query=Wagner%2C+S+K">Siegfried K. Wagner</a>, <a href="/search/eess?searchtype=author&amp;query=Chia%2C+M+A">Mark A. Chia</a>, <a href="/search/eess?searchtype=author&amp;query=Yang%2C+D">Dawei Yang</a>, <a href="/search/eess?searchtype=author&amp;query=Jiang%2C+H">Hongyang Jiang</a>, <a href="/search/eess?searchtype=author&amp;query=Ran%2C+A">AnRan Ran</a>, <a href="/search/eess?searchtype=author&amp;query=Santos%2C+R">Rui Santos</a>, <a href="/search/eess?searchtype=author&amp;query=Somfai%2C+G+M">Gabor Mark Somfai</a>, <a href="/search/eess?searchtype=author&amp;query=Zhou%2C+J+H">Juan Helen Zhou</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+H">Haoyu Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+Q">Qingyu Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Cheung%2C+C+Y">Carol Yim-Lui Cheung</a>, <a href="/search/eess?searchtype=author&amp;query=Keane%2C+P+A">Pearse A. Keane</a>, <a href="/search/eess?searchtype=author&amp;query=Tham%2C+Y+C">Yih Chung Tham</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.06289v1-abstract-short" style="display: inline;"> The advent of foundation models (FMs) is transforming medical domain. In ophthalmology, RETFound, a retina-specific FM pre-trained sequentially on 1.4 million natural images and 1.6 million retinal images, has demonstrated high adaptability across clinical applications. Conversely, DINOv2, a general-purpose vision FM pre-trained on 142 million natural images, has shown promise in non-medical domai&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06289v1-abstract-full').style.display = 'inline'; document.getElementById('2502.06289v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.06289v1-abstract-full" style="display: none;"> The advent of foundation models (FMs) is transforming medical domain. In ophthalmology, RETFound, a retina-specific FM pre-trained sequentially on 1.4 million natural images and 1.6 million retinal images, has demonstrated high adaptability across clinical applications. Conversely, DINOv2, a general-purpose vision FM pre-trained on 142 million natural images, has shown promise in non-medical domains. However, its applicability to clinical tasks remains underexplored. 
To address this, we conducted head-to-head evaluations by fine-tuning RETFound and three DINOv2 models (large, base, small) for ocular disease detection and systemic disease prediction tasks, across eight standardized open-source ocular datasets, as well as the Moorfields AlzEye and the UK Biobank datasets. DINOv2-large model outperformed RETFound in detecting diabetic retinopathy (AUROC=0.850-0.952 vs 0.823-0.944, across three datasets, all P&lt;=0.007) and multi-class eye diseases (AUROC=0.892 vs. 0.846, P&lt;0.001). In glaucoma, DINOv2-base model outperformed RETFound (AUROC=0.958 vs 0.940, P&lt;0.001). Conversely, RETFound achieved superior performance over all DINOv2 models in predicting heart failure, myocardial infarction, and ischaemic stroke (AUROC=0.732-0.796 vs 0.663-0.771, all P&lt;0.001). These trends persisted even with 10% of the fine-tuning data. These findings showcase the distinct scenarios where general-purpose and domain-specific FMs excel, highlighting the importance of aligning FM selection with task-specific requirements to optimise clinical performance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06289v1-abstract-full').style.display = 'none'; document.getElementById('2502.06289v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.05330">arXiv:2502.05330</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.05330">pdf</a>, <a href="https://arxiv.org/format/2502.05330">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Multi-Class Segmentation of Aortic Branches and Zones in Computed Tomography Angiography: The AortaSeg24 Challenge </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Imran%2C+M">Muhammad Imran</a>, <a href="/search/eess?searchtype=author&amp;query=Krebs%2C+J+R">Jonathan R. Krebs</a>, <a href="/search/eess?searchtype=author&amp;query=Sivaraman%2C+V+B">Vishal Balaji Sivaraman</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+T">Teng Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Kumar%2C+A">Amarjeet Kumar</a>, <a href="/search/eess?searchtype=author&amp;query=Ueland%2C+W+R">Walker R. Ueland</a>, <a href="/search/eess?searchtype=author&amp;query=Fassler%2C+M+J">Michael J. 
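   For readers reproducing this kind of head-to-head protocol, the comparison reduces to scoring each fine-tuned model on the same held-out test set; a minimal scikit-learn sketch (the labels and scores below are random placeholders, not the study's outputs):

```python
import numpy as np
from sklearn.metrics import roc_auc_score

# Each fine-tuned model is scored on the same held-out test set.
y_true = np.random.randint(0, 2, size=200)
auroc_retfound = roc_auc_score(y_true, np.random.rand(200))
auroc_dinov2 = roc_auc_score(y_true, np.random.rand(200))
print(f"RETFound AUROC: {auroc_retfound:.3f}  DINOv2-large AUROC: {auroc_dinov2:.3f}")
```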
5. arXiv:2502.05330 [pdf, other]  eess.IV, cs.AI, cs.CV, cs.LG
   Multi-Class Segmentation of Aortic Branches and Zones in Computed Tomography Angiography: The AortaSeg24 Challenge
   Authors: Muhammad Imran, Jonathan R. Krebs, Vishal Balaji Sivaraman, Teng Zhang, Amarjeet Kumar, Walker R. Ueland, Michael J. Fassler, Jinlong Huang, Xiao Sun, Lisheng Wang, Pengcheng Shi, Maximilian Rokuss, Michael Baumgartner, Yannick Kirchhof, Klaus H. Maier-Hein, Fabian Isensee, Shuolin Liu, Bing Han, Bong Thanh Nguyen, Dong-jin Shin, Park Ji-Woo, Mathew Choi, Kwang-Hyun Uhm, Sung-Jea Ko, Chanwoong Lee, et al. (38 additional authors not shown)
   Abstract: Multi-class segmentation of the aorta in computed tomography angiography (CTA) scans is essential for diagnosing and planning complex endovascular treatments for patients with aortic dissections. However, existing methods reduce aortic segmentation to a binary problem, limiting their ability to measure diameters across different branches and zones. Furthermore, no open-source dataset is currently available to support the development of multi-class aortic segmentation methods. To address this gap, we organized the AortaSeg24 MICCAI Challenge, introducing the first dataset of 100 CTA volumes annotated for 23 clinically relevant aortic branches and zones. This dataset was designed to facilitate both model development and validation. The challenge attracted 121 teams worldwide, with participants leveraging state-of-the-art frameworks such as nnU-Net and exploring novel techniques, including cascaded models, data augmentation strategies, and custom loss functions. We evaluated the submitted algorithms using the Dice Similarity Coefficient (DSC) and Normalized Surface Distance (NSD), highlighting the approaches adopted by the top five performing teams. This paper presents the challenge design, dataset details, evaluation metrics, and an in-depth analysis of the top-performing algorithms. The annotated dataset, evaluation code, and implementations of the leading methods are publicly available to support further research. All resources can be accessed at https://aortaseg24.grand-challenge.org.
   Submitted 7 February, 2025; originally announced February 2025.
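   The DSC used here is a standard overlap metric; a minimal per-class implementation for multi-class label volumes follows (the zero-denominator convention and random toy data are illustrative, and NSD is omitted since it requires surface extraction):

```python
import numpy as np

def dice_coefficient(pred: np.ndarray, gt: np.ndarray, label: int) -> float:
    """DSC for one class in a multi-class volume: 2|P∩G| / (|P| + |G|).
    Returns 1.0 when the class is absent from both volumes (a common,
    but not universal, convention)."""
    p, g = pred == label, gt == label
    denom = p.sum() + g.sum()
    return 1.0 if denom == 0 else 2.0 * np.logical_and(p, g).sum() / denom

# Toy volumes with 23 foreground classes, mirroring the challenge setup
pred = np.random.randint(0, 24, size=(64, 64, 64))
gt = np.random.randint(0, 24, size=(64, 64, 64))
print(f"mean DSC: {np.mean([dice_coefficient(pred, gt, c) for c in range(1, 24)]):.3f}")
```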
6. arXiv:2502.04794 [pdf, other]  eess.IV, cs.AI, cs.CV
   MedMimic: Physician-Inspired Multimodal Fusion for Early Diagnosis of Fever of Unknown Origin
   Authors: Minrui Chen, Yi Zhou, Huidong Jiang, Yuhan Zhu, Guanjie Zou, Minqi Chen, Rong Tian, Hiroto Saigo
   Abstract: Fever of unknown origin (FUO) remains a diagnostic challenge. MedMimic is introduced as a multimodal framework inspired by real-world diagnostic processes. It uses pretrained models such as DINOv2, Vision Transformer, and ResNet-18 to convert high-dimensional 18F-FDG PET/CT imaging into low-dimensional, semantically meaningful features. A learnable self-attention-based fusion network then integrates these imaging features with clinical data for classification. Using 416 FUO patient cases from Sichuan University West China Hospital from 2017 to 2023, the multimodal fusion classification network (MFCN) achieved macro-AUROC scores ranging from 0.8654 to 0.9291 across seven tasks, outperforming conventional machine learning and single-modality deep learning methods. Ablation studies and five-fold cross-validation further validated its effectiveness. By combining the strengths of pretrained large models and deep learning, MedMimic offers a promising solution for disease classification.
   Submitted 13 February, 2025; v1 submitted 7 February, 2025; originally announced February 2025.
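   A hedged sketch of what a self-attention-based fusion network of this shape might look like in PyTorch; the dimensions, two-token layout, and class count are assumptions for illustration, not MedMimic's architecture:

```python
import torch
import torch.nn as nn

class FusionClassifier(nn.Module):
    """Two-token self-attention fusion: one token from imaging features,
    one from clinical data, mixed by multi-head attention. Dimensions,
    layout, and class count are illustrative assumptions."""
    def __init__(self, img_dim=768, clin_dim=32, d_model=256, n_classes=2):
        super().__init__()
        self.img_proj = nn.Linear(img_dim, d_model)
        self.clin_proj = nn.Linear(clin_dim, d_model)
        self.attn = nn.MultiheadAttention(d_model, num_heads=4, batch_first=True)
        self.head = nn.Linear(d_model, n_classes)

    def forward(self, img_feats, clin_feats):
        tokens = torch.stack([self.img_proj(img_feats), self.clin_proj(clin_feats)], dim=1)
        fused, _ = self.attn(tokens, tokens, tokens)   # self-attention over the two tokens
        return self.head(fused.mean(dim=1))

logits = FusionClassifier()(torch.randn(4, 768), torch.randn(4, 32))
print(logits.shape)  # torch.Size([4, 2])
```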
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.04735">arXiv:2502.04735</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.04735">pdf</a>, <a href="https://arxiv.org/format/2502.04735">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Affine Frequency Division Multiplexing: Extending OFDM for Scenario-Flexibility and Resilience </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Yin%2C+H">Haoran Yin</a>, <a href="/search/eess?searchtype=author&amp;query=Tang%2C+Y">Yanqun Tang</a>, <a href="/search/eess?searchtype=author&amp;query=Bemani%2C+A">Ali Bemani</a>, <a href="/search/eess?searchtype=author&amp;query=Kountouris%2C+M">Marios Kountouris</a>, <a href="/search/eess?searchtype=author&amp;query=Zhou%2C+Y">Yu Zhou</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+X">Xingyao Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+Y">Yuqing Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+G">Gaojie Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Yang%2C+K">Kai Yang</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+F">Fan Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Masouros%2C+C">Christos Masouros</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+S">Shuangyang Li</a>, <a href="/search/eess?searchtype=author&amp;query=Caire%2C+G">Giuseppe Caire</a>, <a href="/search/eess?searchtype=author&amp;query=Xiao%2C+P">Pei Xiao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.04735v1-abstract-short" style="display: inline;"> Next-generation wireless networks are conceived to provide reliable and high-data-rate communication services for diverse scenarios, such as vehicle-to-vehicle, unmanned aerial vehicles, and satellite networks. The severe Doppler spreads in the underlying time-varying channels induce destructive inter-carrier interference (ICI) in the extensively adopted orthogonal frequency division multiplexing&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.04735v1-abstract-full').style.display = 'inline'; document.getElementById('2502.04735v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.04735v1-abstract-full" style="display: none;"> Next-generation wireless networks are conceived to provide reliable and high-data-rate communication services for diverse scenarios, such as vehicle-to-vehicle, unmanned aerial vehicles, and satellite networks. The severe Doppler spreads in the underlying time-varying channels induce destructive inter-carrier interference (ICI) in the extensively adopted orthogonal frequency division multiplexing (OFDM) waveform, leading to severe performance degradation. This calls for a new air interface design that can accommodate the severe delay-Doppler spreads in highly dynamic channels while possessing sufficient flexibility to cater to various applications. This article provides a comprehensive overview of a promising chirp-based waveform named affine frequency division multiplexing (AFDM). 
It is featured with two tunable parameters and achieves optimal diversity order in doubly dispersive channels (DDC). We study the fundamental principle of AFDM, illustrating its intrinsic suitability for DDC. Based on that, several potential applications of AFDM are explored. Furthermore, the major challenges and the corresponding solutions of AFDM are presented, followed by several future research directions. Finally, we draw some instructive conclusions about AFDM, hoping to provide useful inspiration for its development. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.04735v1-abstract-full').style.display = 'none'; document.getElementById('2502.04735v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Magazine paper submitted to IEEE</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.02024">arXiv:2502.02024</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.02024">pdf</a>, <a href="https://arxiv.org/format/2502.02024">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> UD-Mamba: A pixel-level uncertainty-driven Mamba model for medical image segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Zhao%2C+W">Weiren Zhao</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+F">Feng Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+Y">Yanran Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Xie%2C+Y">Yutong Xie</a>, <a href="/search/eess?searchtype=author&amp;query=Wu%2C+Q">Qi Wu</a>, <a href="/search/eess?searchtype=author&amp;query=Zhou%2C+Y">Yuyin Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.02024v1-abstract-short" style="display: inline;"> Recent advancements have highlighted the Mamba framework, a state-space model known for its efficiency in capturing long-range dependencies with linear computational complexity. 
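   AFDM modulation is the inverse discrete affine Fourier transform, whose two chirp rates are the tunable parameters mentioned above. A NumPy sketch following the published AFDM definition (the parameter values are illustrative; verify conventions against the article):

```python
import numpy as np

def afdm_modulate(x, c1, c2):
    """AFDM modulation, i.e. the inverse discrete affine Fourier transform:
    s[n] = (1/sqrt(N)) * sum_m x[m] * exp(j*2*pi*(c1*n^2 + n*m/N + c2*m^2)).
    c1 and c2 are the waveform's two tunable chirp parameters."""
    N = len(x)
    n = np.arange(N)
    phase = c1 * n[:, None] ** 2 + np.outer(n, n) / N + c2 * n[None, :] ** 2
    return np.exp(2j * np.pi * phase) @ x / np.sqrt(N)

qpsk = np.random.choice([1 + 1j, 1 - 1j, -1 + 1j, -1 - 1j], size=64)
s = afdm_modulate(qpsk, c1=1 / (2 * 64), c2=0.0)   # illustrative parameter choice
print(np.vdot(s, s).real)  # ~128.0: the transform is unitary, so energy is preserved
```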
8. arXiv:2502.02024 [pdf, other]  eess.IV, cs.CV
   UD-Mamba: A pixel-level uncertainty-driven Mamba model for medical image segmentation
   Authors: Weiren Zhao, Feng Wang, Yanran Wang, Yutong Xie, Qi Wu, Yuyin Zhou
   Abstract: Recent advancements have highlighted the Mamba framework, a state-space model known for its efficiency in capturing long-range dependencies with linear computational complexity. While Mamba has shown competitive performance in medical image segmentation, it encounters difficulties in modeling local features due to the sporadic nature of traditional location-based scanning methods and the complex, ambiguous boundaries often present in medical images. To overcome these challenges, we propose Uncertainty-Driven Mamba (UD-Mamba), which redefines the pixel-order scanning process by incorporating channel uncertainty into the scanning mechanism. UD-Mamba introduces two key scanning techniques: 1) sequential scanning, which prioritizes regions with high uncertainty by scanning in a row-by-row fashion, and 2) skip scanning, which processes columns vertically, moving from high-to-low or low-to-high uncertainty at fixed intervals. Sequential scanning efficiently clusters high-uncertainty regions, such as boundaries and foreground objects, to improve segmentation precision, while skip scanning enhances the interaction between background and foreground regions, allowing for timely integration of background information to support more accurate foreground inference. Recognizing the advantages of scanning from certain to uncertain areas, we introduce four learnable parameters to balance the importance of features extracted from different scanning methods. Additionally, a cosine consistency loss is employed to mitigate the drawbacks of transitioning between uncertain and certain regions during the scanning process. Our method demonstrates robust segmentation performance, validated across three distinct medical imaging datasets involving pathology, dermatological lesions, and cardiac tasks.
   Submitted 4 February, 2025; originally announced February 2025.
   Comments: 19 pages
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.01986v1-abstract-full').style.display = 'none'; document.getElementById('2502.01986v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.15368">arXiv:2501.15368</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.15368">pdf</a>, <a href="https://arxiv.org/format/2501.15368">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Baichuan-Omni-1.5 Technical Report </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Li%2C+Y">Yadong Li</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+J">Jun Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+T">Tao Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+T">Tao Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Chen%2C+S">Song Chen</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+T">Tianpeng Li</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+Z">Zehuan Li</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+L">Lijun Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Ming%2C+L">Lingfeng Ming</a>, <a href="/search/eess?searchtype=author&amp;query=Dong%2C+G">Guosheng Dong</a>, <a href="/search/eess?searchtype=author&amp;query=Pan%2C+D">Da Pan</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+C">Chong Li</a>, <a href="/search/eess?searchtype=author&amp;query=Fang%2C+Y">Yuanbo Fang</a>, <a href="/search/eess?searchtype=author&amp;query=Kuang%2C+D">Dongdong Kuang</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+M">Mingrui Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Zhu%2C+C">Chenglin Zhu</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+Y">Youwei Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Guo%2C+H">Hongyu Guo</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+F">Fengyu Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+Y">Yuran Wang</a>, <a href="/search/eess?searchtype=author&amp;query=Ding%2C+B">Bowen Ding</a>, <a href="/search/eess?searchtype=author&amp;query=Song%2C+W">Wei Song</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+X">Xu Li</a>, <a href="/search/eess?searchtype=author&amp;query=Huo%2C+Y">Yuqi Huo</a>, <a href="/search/eess?searchtype=author&amp;query=Liang%2C+Z">Zheng Liang</a> , et al. 
(68 additional authors not shown)
Abstract: We introduce Baichuan-Omni-1.5, an omni-modal model that not only has omni-modal understanding capabilities but also provides end-to-end audio generation capabilities. To achieve fluent and high-quality interaction across modalities without compromising the capabilities of any modality, we prioritized optimizing three key aspects. First, we establish a comprehensive data cleaning and synthesis pipeline for multimodal data, obtaining about 500B high-quality data (text, audio, and vision). Second, an audio-tokenizer (Baichuan-Audio-Tokenizer) has been designed to capture both semantic and acoustic information from audio, enabling seamless integration and enhanced compatibility with MLLM. Lastly, we designed a multi-stage training strategy that progressively integrates multimodal alignment and multitask fine-tuning, ensuring effective synergy across all modalities. Baichuan-Omni-1.5 leads contemporary models (including GPT4o-mini and MiniCPM-o 2.6) in terms of comprehensive omni-modal capabilities. Notably, it achieves results comparable to leading models such as Qwen2-VL-72B across various multimodal medical benchmarks.
Submitted 25 January, 2025; originally announced January 2025.
arXiv:2501.14576 [pdf, other] (math.OC, eess.SY)
Dynamic Operation and Control of a Multi-Stack Alkaline Water Electrolysis System with Shared Gas Separators and Lye Circulation: A Model-Based Study
Authors: Yiwei Qiu, Jiatong Li, Yangjun Zeng, Yi Zhou, Shi Chen, Xiaoyan Qiu, Buxiang Zhou, Ge He, Xu Ji, Wenying Li
Abstract: An emerging approach for large-scale hydrogen production using renewable energy is to integrate multiple alkaline water electrolysis (AWE) stacks into a single balance of plant (BoP) system, sharing components such as gas-lye separation and lye circulation. This configuration, termed the $N$-in-1 AWE system, packs $N$ stacks into a modular system, reducing land requirements, the complexity of plant topology, and overall capital costs. However, the coupling of these stacks through the shared BoP introduces challenges in dynamic operation under varying energy inputs, making their performance unclear compared to traditional 1-in-1 systems. To address this, we develop a state-space model of the $N$-in-1 AWE system, capturing the dynamic behaviors of lye circulation, temperature, and HTO impurity, and their impact on energy conversion efficiency. We then propose a nonlinear model predictive controller (NMPC) to jointly optimize inter-stack electrolytic current distribution, lye flow, and cooling, enabling the system to dynamically track varying load commands while maximizing efficiency, stabilizing temperature, and limiting HTO impurity accumulation. Simulation studies on a 4,000 Nm$^3$/h-rated 4-in-1 system verify the proposed controller under dynamic operation. Comparison with 4 independent 1-in-1 systems reveals that, with proper control, the $N$-in-1 configuration offers comparable flexibility in accommodating real-world wind power inputs. The average differences in the root-mean-square errors (RMSEs) for load-tracking and stack temperature stabilization, and specific energy consumption are below 0.014 MW, 2.356 K, and 0.003 kWh/Nm$^3$.
Submitted 24 January, 2025; originally announced January 2025.
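One flavor of the controller's job, splitting current across stacks while meeting a plant-level load, can be caricatured in a few lines. This is only a toy allocation subproblem under made-up efficiency curves; specific_energy, the bounds, and the load value are all illustrative assumptions, not the paper's NMPC or AWE model.

```python
import numpy as np
from scipy.optimize import minimize

def specific_energy(u, a=4.3, b=0.8):
    # Toy per-stack specific-energy curve vs. normalized load
    # (kWh/Nm^3 flavored numbers, purely illustrative).
    return a + b * u**2

def allocate_load(total, n_stacks=4):
    """Split a normalized plant load across stacks to minimize total
    energy use -- the shape of the inter-stack current-distribution
    decision an NMPC would make at every step."""
    cons = [{"type": "eq", "fun": lambda u: u.sum() - total}]
    bounds = [(0.1, 1.0)] * n_stacks          # min/max stack loading
    u0 = np.full(n_stacks, total / n_stacks)  # start from an even split
    res = minimize(lambda u: (u * specific_energy(u)).sum(),
                   u0, bounds=bounds, constraints=cons)
    return res.x

print(allocate_load(2.4))
```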
arXiv:2501.13541 [pdf, other] (eess.SP)
A Dual-Polarization Feature Fusion Network for Radar Automatic Target Recognition Based On HRRP Sequence
Authors: Yangbo Zhou, Sen Liu, Hong-Wei Gao, Hai lin, Guohua Wei, Xiaoqing Wang, Xiao-Min Pan
Abstract: Recent advances in radar automatic target recognition (RATR) techniques utilizing deep neural networks have demonstrated remarkable performance, largely due to their robust generalization capabilities. To address the challenges of applications with polarimetric HRRP sequences, a dual-polarization feature fusion network (DPFFN) is proposed along with a novel two-stage feature fusion strategy. Moreover, a specific fusion loss function is developed, which enables the adaptive generation of comprehensive multi-modal representations from polarimetric HRRP sequences. Experimental results demonstrate that the proposed network significantly improves performance in radar target recognition tasks, thus validating its effectiveness. The PyTorch implementation of our proposed DPFFN is available at https://github.com/xmpan/DPFFN.
Submitted 23 January, 2025; originally announced January 2025.
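The two-branch fusion pattern is easy to picture in code. Below is a minimal PyTorch sketch of per-polarization encoders followed by a joint head; the layer shapes and sizes are placeholders, not the published DPFFN (which is available at the repository linked above).

```python
import torch
import torch.nn as nn

class TwoBranchFusion(nn.Module):
    """Illustrative fusion of dual-polarization HRRP sequences:
    one encoder per polarization, then a joint classification head."""
    def __init__(self, feat=64, n_classes=10):
        super().__init__()
        def branch():
            return nn.Sequential(
                nn.Conv1d(1, feat, kernel_size=7, padding=3), nn.ReLU(),
                nn.AdaptiveAvgPool1d(1))       # (B, feat, 1)
        self.h_branch, self.v_branch = branch(), branch()
        self.head = nn.Sequential(
            nn.Linear(2 * feat, feat), nn.ReLU(),
            nn.Linear(feat, n_classes))

    def forward(self, x_h, x_v):               # each (B, 1, L)
        f = torch.cat([self.h_branch(x_h).squeeze(-1),
                       self.v_branch(x_v).squeeze(-1)], dim=1)
        return self.head(f)

logits = TwoBranchFusion()(torch.randn(8, 1, 256), torch.randn(8, 1, 256))
```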
arXiv:2501.10654 [pdf, other] (eess.SP)
Efficient Transmission of Radiomaps via Physics-Enhanced Semantic Communications
Authors: Yueling Zhou, Achintha Wijesinghe, Yue Wang, Songyang Zhang, Zhipeng Cai
Abstract: Enriching information of spectrum coverage, radiomaps play an important role in many wireless communication applications, such as resource allocation and network optimization.
To enable real-time, distributed spectrum management, particularly in scenarios with unstable and dynamic environments, the efficient transmission of spectrum coverage information for radiomaps from edge devices to the central server emerges as a critical problem. In this work, we propose an innovative physics-enhanced semantic communication framework tailored for efficient radiomap transmission based on generative learning models. Specifically, instead of bit-wise message passing, we only transmit the key "semantics" in radiomaps characterized by the radio propagation behavior and surrounding environments, where semantic compression schemes are utilized to reduce the communication overhead. Incorporating the novel concept of Radio Depth Maps, the radiomaps are reconstructed from the delivered semantic information backboned on conditional generative adversarial networks. Our framework is further extended to facilitate its implementation in multi-user edge computing scenarios, by integrating with federated learning for collaborative model training while preserving data privacy. Experimental results show that our approach achieves high accuracy in radio coverage information recovery at ultra-high bandwidth efficiency, which has great potential in many wireless-generated data transmission applications.
Submitted 17 January, 2025; originally announced January 2025.
Comments: To appear in 2025 IEEE International Conference on Communications

arXiv:2501.09759 [pdf] (eess.SP, physics.app-ph)
A wideband amplifying and filtering reconfigurable intelligent surface for wireless relay
Authors: Lijie Wu, Qun Yan Zhou, Jun Yan Dai, Siran Wang, Junwei Zhang, Zhen Jie Qi, Hanqing Yang, Ruizhe Jiang, Zheng Xing Wang, Huidong Li, Zhen Zhang, Jiang Luo, Qiang Cheng, Tie Jun Cui
Abstract: Programmable metasurfaces have garnered significant attention due to their exceptional ability to manipulate electromagnetic (EM) waves in real time, leading to the emergence of a prominent area in wireless communication, namely reconfigurable intelligent surfaces (RISs), to control the signal propagation and coverage. However, the existing RISs usually suffer from limited operating distance and band interference, which hinder their practical applications in wireless relay and communication systems. To overcome these limitations, we propose an amplifying and filtering RIS (AF-RIS) to enhance the in-band signal energy and filter the out-of-band signal of the incident EM waves, ensuring the miniaturization of the RIS array and enabling its anti-interference ability. In addition, each AF-RIS element is equipped with a 2-bit phase control capability, further endowing the entire array with great beamforming performance. An elaborately designed 4×8 AF-RIS array is presented by integrating the power dividing and combining networks, which substantially reduces the number of amplifiers and filters, thereby reducing the hardware costs and power consumption. Experimental results showcase the powerful capabilities of AF-RIS in beam-steering, frequency selectivity, and signal amplification. Therefore, the proposed AF-RIS holds significant promise for critical applications in wireless relay systems by offering an efficient solution to improve frequency selectivity, enhance signal coverage, and reduce hardware size.
Submitted 31 December, 2024; originally announced January 2025.
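A 2-bit phase control means each element chooses one of four phase states. The snippet below snaps ideal steering phases for a 4×8 aperture to the nearest state; the half-wavelength spacing and 30-degree steering angle are illustrative assumptions, not the fabricated array's parameters.

```python
import numpy as np

# The four phase states available to a 2-bit element.
STATES = np.array([0, np.pi/2, np.pi, 3*np.pi/2])

def quantize(phase):
    """Snap a desired continuous phase to the nearest 2-bit state,
    measured by distance on the unit circle."""
    phase = np.mod(phase, 2 * np.pi)
    idx = np.argmin(np.abs(np.exp(1j * phase)[..., None]
                           - np.exp(1j * STATES)), axis=-1)
    return STATES[idx]

# Ideal steering phases for a 4x8 array, half-wavelength spacing,
# beam steered 30 degrees off broadside.
m, _ = np.meshgrid(np.arange(8), np.arange(4))
ideal = -np.pi * m * np.sin(np.radians(30))
coded = quantize(ideal)
print(np.degrees(coded[0, :4]))   # phase states of the first elements
```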
arXiv:2501.09305 [pdf, other] (eess.IV, cs.CV, physics.med-ph)
Domain-conditioned and Temporal-guided Diffusion Modeling for Accelerated Dynamic MRI Reconstruction
Authors: Liping Zhang, Iris Yuwen Zhou, Sydney B. Montesi, Li Feng, Fang Liu
Abstract: Purpose: To propose a domain-conditioned and temporal-guided diffusion modeling method, termed dynamic Diffusion Modeling (dDiMo), for accelerated dynamic MRI reconstruction, enabling the diffusion process to characterize spatiotemporal information for time-resolved multi-coil Cartesian and non-Cartesian data.
Methods: The dDiMo framework integrates temporal information from time-resolved dimensions, allowing for the concurrent capture of intra-frame spatial features and inter-frame temporal dynamics in diffusion modeling. It employs additional spatiotemporal ($x$-$t$) and self-consistent frequency-temporal ($k$-$t$) priors to guide the diffusion process. This approach ensures precise temporal alignment and enhances the recovery of fine image details. To facilitate a smooth diffusion process, the nonlinear conjugate gradient algorithm is utilized during the reverse diffusion steps. The proposed model was tested on two types of MRI data: Cartesian-acquired multi-coil cardiac MRI and Golden-Angle-Radial-acquired multi-coil free-breathing lung MRI, across various undersampling rates. Results: dDiMo achieved high-quality reconstructions at various acceleration factors, demonstrating improved temporal alignment and structural recovery compared to other competitive reconstruction methods, both qualitatively and quantitatively. This proposed diffusion framework exhibited robust performance in handling both Cartesian and non-Cartesian acquisitions, effectively reconstructing dynamic datasets in cardiac and lung MRI under different imaging conditions. Conclusion: This study introduces a novel diffusion modeling method for dynamic MRI reconstruction.
Submitted 16 January, 2025; originally announced January 2025.
Comments: 21 pages, 15 figures, 2 tables

arXiv:2501.04222 [pdf, ps, other] (eess.SY)
Privacy-Preserving Distributed Online Mirror Descent for Nonconvex Optimization
Authors: Yingjie Zhou, Tao Li
Abstract: We investigate the distributed online nonconvex optimization problem with differential privacy over time-varying networks. Each node minimizes the sum of several nonconvex functions while preserving the node's differential privacy. We propose a privacy-preserving distributed online mirror descent algorithm for nonconvex optimization, which uses the mirror descent to update decision variables and the Laplace differential privacy mechanism to protect privacy. Unlike existing works, the proposed algorithm allows the cost functions to be nonconvex, which makes it more widely applicable. Based upon these, we prove that if the communication network is $B$-strongly connected and the constraint set is compact, then by choosing the step size properly, the algorithm guarantees $\epsilon$-differential privacy at each time. Furthermore, we prove that if the local cost functions are $\beta$-smooth, then the regret over time horizon $T$ grows sublinearly while preserving differential privacy, with an upper bound $O(\sqrt{T})$. Finally, the effectiveness of the algorithm is demonstrated through numerical simulations.
Submitted 7 January, 2025; originally announced January 2025.
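The core update is compact: perturb each gradient with Laplace noise, then take a mirror-descent step. Below is a toy rendition on the probability simplex with the entropic mirror map; the step size, noise scale, and quadratic stand-in loss are illustrative assumptions, not the paper's setting.

```python
import numpy as np

rng = np.random.default_rng(0)

def private_mirror_step(x, grad, eta=0.1, eps=1.0, sens=1.0):
    """One entropic mirror-descent update on the simplex with a
    Laplace-perturbed gradient (sens/eps is the standard Laplace
    mechanism scale for sensitivity `sens` and privacy budget `eps`)."""
    noisy = grad + rng.laplace(scale=sens / eps, size=grad.shape)
    x = x * np.exp(-eta * noisy)      # mirror (KL-geometry) step
    return x / x.sum()                # Bregman projection onto simplex

x = np.full(5, 0.2)
target = np.array([0.4, 0.3, 0.1, 0.1, 0.1])
for t in range(100):
    grad = 2 * (x - target)           # toy quadratic stand-in loss
    x = private_mirror_step(x, grad)
print(x)
```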
arXiv:2501.03592 [pdf, other] (eess.IV, cs.CV, physics.optics)
A Value Mapping Virtual Staining Framework for Large-scale Histological Imaging
Authors: Junjia Wang, Bo Xiong, You Zhou, Xun Cao, Zhan Ma
Abstract: The emergence of virtual staining technology provides a rapid and efficient alternative for researchers in tissue pathology. It enables the utilization of unlabeled microscopic samples to generate virtual replicas of chemically stained histological slices, or facilitate the transformation of one staining type into another. The remarkable performance of generative networks, such as CycleGAN, offers an unsupervised learning approach for virtual coloring, overcoming the limitations of high-quality paired data required in supervised learning. Nevertheless, large-scale color transformation necessitates processing large field-of-view images in patches, often resulting in significant boundary inconsistency and artifacts. Additionally, the transformation between different colorized modalities typically needs further efforts to modify loss functions and tune hyperparameters for independent training of networks. In this study, we introduce a general virtual staining framework that is adaptable to various conditions. We propose a loss function based on the value mapping constraint to ensure the accuracy of virtual coloring between different pathological modalities, termed the Value Mapping Generative Adversarial Network (VM-GAN). Meanwhile, we present a confidence-based tiling method to address the challenge of boundary inconsistency arising from patch-wise processing. Experimental results on diverse data with varying staining protocols demonstrate that our method achieves superior quantitative indicators and improved visual perception.
Submitted 7 January, 2025; originally announced January 2025.
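Patch-wise processing with feathered overlaps is the generic skeleton behind tiling fixes like the confidence-based method above. A sketch for a single-channel image follows; the triangular blending window and tile sizes are assumptions, and the paper's confidence weighting is not reproduced here.

```python
import numpy as np

def blend_tiles(image, process, tile=256, overlap=32):
    """Apply `process` to overlapping tiles of a 2D image and blend
    the results with a feathered weight window, suppressing seams at
    tile boundaries."""
    H, W = image.shape
    out = np.zeros((H, W), dtype=float)
    weight = np.zeros((H, W), dtype=float)
    ramp = np.minimum(np.arange(tile) + 1, np.arange(tile)[::-1] + 1)
    win = np.outer(ramp, ramp).astype(float)   # peaks at tile center
    step = tile - overlap
    for y in range(0, max(H - overlap, 1), step):
        for x in range(0, max(W - overlap, 1), step):
            ys, xs = slice(y, min(y + tile, H)), slice(x, min(x + tile, W))
            patch = process(image[ys, xs])
            w = win[:patch.shape[0], :patch.shape[1]]
            out[ys, xs] += patch * w
            weight[ys, xs] += w
    return out / np.maximum(weight, 1e-8)
```

For instance, `blend_tiles(img, lambda p: 1.0 - p)` inverts an image tile by tile without visible seams.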
arXiv:2501.03293 [pdf, other] (eess.IV)
K-space Diffusion Model Based MR Reconstruction Method for Simultaneous Multislice Imaging
Authors: Ting Zhao, Zhuoxu Cui, Congcong Liu, Xingyang Wu, Yihang Zhou, Dong Liang, Haifeng Wang
Abstract: Simultaneous Multi-Slice (SMS) is a magnetic resonance imaging (MRI) technique which excites several slices concurrently using multiband radiofrequency pulses to reduce scanning time.
However, due to its variable data structure and difficulty in acquisition, it is challenging to integrate SMS data as training data into deep learning frameworks. This study proposed a novel k-space diffusion model of SMS reconstruction that does not utilize SMS data for training. Instead, it incorporates Slice GRAPPA during the sampling process to reconstruct SMS data from different acquisition modes. Our results demonstrated that this method outperforms traditional SMS reconstruction methods and can achieve higher acceleration factors without in-plane aliasing.
Submitted 9 January, 2025; v1 submitted 6 January, 2025; originally announced January 2025.
Comments: Accepted at the 2025 IEEE 22nd International Symposium on Biomedical Imaging (ISBI)

arXiv:2501.02953 [pdf, other] (cs.SD, eess.AS)
SYKI-SVC: Advancing Singing Voice Conversion with Post-Processing Innovations and an Open-Source Professional Testset
Authors: Yiquan Zhou, Wenyu Wang, Hongwu Ding, Jiacheng Xu, Jihua Zhu, Xin Gao, Shihao Li
Abstract: Singing voice conversion aims to transform a source singing voice into that of a target singer while preserving the original lyrics, melody, and various vocal techniques. In this paper, we propose a high-fidelity singing voice conversion system. Our system builds upon the SVCC T02 framework and consists of three key components: a feature extractor, a voice converter, and a post-processor. The feature extractor utilizes the ContentVec and Whisper models to derive F0 contours and extract speaker-independent linguistic features from the input singing voice. The voice converter then integrates the extracted timbre, F0, and linguistic content to synthesize the target speaker's waveform. The post-processor augments high-frequency information directly from the source through simple and effective signal processing to enhance audio quality. Due to the lack of a standardized professional dataset for evaluating expressive singing conversion systems, we have created and made publicly available a specialized test set. Comparative evaluations demonstrate that our system achieves a remarkably high level of naturalness, and further analysis confirms the efficacy of our proposed system design.
Submitted 6 January, 2025; originally announced January 2025.
Comments: Accepted by ICASSP 2025
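The post-processor's idea of borrowing high-frequency detail from the source is simple to demo in the frequency domain. A bare-bones version follows; the cutoff, sample rate, and whole-signal FFT splice are simplifying assumptions, and the paper's actual processing is more careful.

```python
import numpy as np

def restore_highs(converted, source, sr=44100, cutoff=8000):
    """Copy spectral content above `cutoff` Hz from the source signal
    into the converted one, a crude form of high-frequency
    augmentation by plain signal processing."""
    n = min(len(converted), len(source))
    C = np.fft.rfft(converted[:n])
    S = np.fft.rfft(source[:n])
    k = int(cutoff * n / sr)          # first bin at/above the cutoff
    C[k:] = S[k:]                     # splice in the source's highs
    return np.fft.irfft(C, n)
```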
arXiv:2501.02751 [pdf, other] (eess.IV, cs.CV, cs.MM)
Ultrasound-QBench: Can LLMs Aid in Quality Assessment of Ultrasound Imaging?
Authors: Hongyi Miao, Jun Jia, Yankun Cao, Yingjie Zhou, Yanwei Jiang, Zhi Liu, Guangtao Zhai
Abstract: With the dramatic upsurge in the volume of ultrasound examinations, low-quality ultrasound imaging has gradually increased due to variations in operator proficiency and imaging circumstances, imposing a severe burden on diagnosis accuracy and even entailing the risk of restarting the diagnosis in critical cases.
To assist clinicians in selecting high-quality ultrasound images and ensuring accurate diagnoses, we introduce Ultrasound-QBench, a comprehensive benchmark that systematically evaluates multimodal large language models (MLLMs) on quality assessment tasks of ultrasound images. Ultrasound-QBench establishes two datasets collected from diverse sources: IVUSQA, consisting of 7,709 images, and CardiacUltraQA, containing 3,863 images. These images, encompassing common ultrasound imaging artifacts, are annotated by professional ultrasound experts and classified into three quality levels: high, medium, and low. To better evaluate MLLMs, we decompose the quality assessment task into three dimensions: qualitative classification, quantitative scoring, and comparative assessment. The evaluation of 7 open-source MLLMs as well as 1 proprietary MLLM demonstrates that MLLMs possess preliminary capabilities for low-level visual tasks in ultrasound image quality classification. We hope this benchmark will inspire the research community to delve deeper into uncovering and enhancing the untapped potential of MLLMs for medical imaging tasks.
Submitted 5 January, 2025; originally announced January 2025.

arXiv:2501.02747 [pdf, other] (eess.SP)
An Efficient Pre-Processing Method for 6G Dynamic Ray-Tracing Channel Modeling
Authors: Songjiang Yang, Cheng-Xiang Wang, Yinghua Wang, Jie Huang, Yuyang Zhou, el-Hadi M. Aggoune
Abstract: Ray-tracing is often employed in urban areas for channel modeling with high accuracy but incurs substantial computational complexity in high-mobility scenarios. In this paper, we propose a novel pre-processing method for dynamic ray-tracing to reduce the computational burden in high-mobility scenarios by prepending the intersection judgment to the pre-processing stage. The proposed method generates an inter-visibility matrix that establishes visibility relationships among static objects in the environment, considering the intersection judgment. Moreover, the inter-visibility matrix can be employed to create the inter-visibility table for mobile transmitters and receivers, which can improve the efficiency of constructing an image tree for the three-dimensional (3D) dynamic ray-tracing method. The results show that the proposed pre-processing method in dynamic ray-tracing offers considerable time savings compared with the traditional method while maintaining the same accuracy. The channel characteristics computed by the proposed method match the channel measurements well.
Submitted 5 January, 2025; originally announced January 2025.
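The inter-visibility precomputation can be illustrated with simple geometry. Below is a 2D toy (segment-against-wall intersection only, ignoring collinear touching cases; the paper works with full 3D scenes and image trees): V[i, j] records whether the straight path between two static points is unobstructed, so per-frame ray tracing can skip blocked pairs.

```python
import numpy as np
from itertools import combinations

def segs_intersect(p1, p2, q1, q2):
    """Proper 2D segment intersection via orientation tests."""
    def orient(a, b, c):
        return np.sign((b[0]-a[0])*(c[1]-a[1]) - (b[1]-a[1])*(c[0]-a[0]))
    return (orient(p1, p2, q1) != orient(p1, p2, q2) and
            orient(q1, q2, p1) != orient(q1, q2, p2))

def visibility_matrix(points, walls):
    """V[i, j] is True when no wall segment blocks the straight path
    between static scene points i and j -- the kind of table a dynamic
    ray tracer can precompute once and reuse every frame."""
    n = len(points)
    V = np.eye(n, dtype=bool)
    for i, j in combinations(range(n), 2):
        clear = all(not segs_intersect(points[i], points[j], a, b)
                    for a, b in walls)
        V[i, j] = V[j, i] = clear
    return V

pts = [(0, 0), (10, 0), (5, 8)]
walls = [((4, -2), (4, 6))]            # one blocking wall segment
print(visibility_matrix(pts, walls))   # (0,0)-(10,0) is blocked
```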
arXiv:2501.02252 [pdf, other] (eess.SP)
Scattering Environment Aware Joint Multi-user Channel Estimation and Localization with Spatially Reused Pilots
Authors: Kaiyuan Tian, Yani Chi, Yufan Zhou, An Liu
Abstract: The increasing number of users leads to an increase in pilot overhead, and the limited pilot resources make it challenging to support all users using orthogonal pilots. By fully capturing the inherent physical characteristics of the multi-user (MU) environment, it is possible to reduce pilot costs and improve the channel estimation performance. In reality, users nearby may share the same scatterer, while users further apart tend to have orthogonal channels. This paper proposes a two-timescale approach for joint MU uplink channel estimation and localization in MIMO-OFDM systems, which fully captures the spatial characteristics of MUs. To accurately represent the structure of the MU channel, the channel is modeled in the 3-D location domain. In the long-timescale phase, the time-space-time multiple signal classification (TST-MUSIC) algorithm initially offers a rough approximation of scatterer positions for each user, which is subsequently refined through a scatterer association algorithm based on density-based spatial clustering of applications with noise (DBSCAN). The BS then utilizes this prior information to apply a graph-coloring-based user grouping algorithm, enabling spatial division multiplexing of pilots and reducing pilot overhead. In the short-timescale phase, a low-complexity scattering environment aware location-domain turbo channel estimation (SEA-LD-TurboCE) algorithm is introduced to merge the overlapping scatterer information from MUs, facilitating high-precision joint MU channel estimation and localization under spatially reused pilots. Simulation results verify the superior channel estimation and localization performance of our proposed scheme over the baselines.
Submitted 4 January, 2025; originally announced January 2025.
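Scatterer association with DBSCAN is a standard clustering call. The snippet below groups synthetic per-user position estimates into shared scatterers; all coordinates and the eps/min_samples settings are made-up stand-ins for TST-MUSIC output and the paper's own tuning.

```python
import numpy as np
from sklearn.cluster import DBSCAN

rng = np.random.default_rng(0)

# Rough scatterer position estimates (x, y, z) pooled across users.
est = np.vstack([
    rng.normal([10, 5, 2], 0.3, (20, 3)),    # shared scatterer A
    rng.normal([40, -8, 3], 0.3, (15, 3)),   # shared scatterer B
    rng.uniform(-50, 50, (5, 3)),            # spurious detections
])

labels = DBSCAN(eps=1.0, min_samples=5).fit_predict(est)
# Label -1 marks noise; each remaining label is one associated scatterer.
centers = [est[labels == k].mean(axis=0) for k in set(labels) if k != -1]
print(centers)
```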
arXiv:2501.02001 [pdf, other] (cs.LG, eess.IV, eess.SP)
Communication Efficient Cooperative Edge AI via Event-Triggered Computation Offloading
Authors: You Zhou, Changsheng You, Kaibin Huang
Abstract: Rare events, despite their infrequency, often carry critical information and require immediate attention in mission-critical applications such as autonomous driving, healthcare, and industrial automation. The data-intensive nature of these tasks and their need for prompt responses, combined with designing edge AI (or edge inference), pose significant challenges in systems and techniques. Existing edge inference approaches often suffer from communication bottlenecks due to high-dimensional data transmission and fail to provide timely responses to rare events, limiting their effectiveness for mission-critical applications in the sixth-generation (6G) mobile networks. To overcome these challenges, we propose a channel-adaptive, event-triggered edge-inference framework that prioritizes efficient rare-event processing. Central to this framework is a dual-threshold, multi-exit architecture, which enables early local inference for rare events detected locally while offloading more complex rare events to edge servers for detailed classification. To further enhance the system's performance, we developed a channel-adaptive offloading policy paired with an online algorithm to dynamically determine the optimal confidence thresholds for controlling offloading decisions. The associated optimization problem is solved by reformulating the original non-convex function into an equivalent strongly convex one. Using deep neural network classifiers and real medical datasets, our experiments demonstrate that the proposed framework not only achieves superior rare-event classification accuracy, but also effectively reduces communication overhead, compared with existing edge-inference approaches.
Submitted 1 January, 2025; originally announced January 2025.
Comments: 13 pages, 11 figures
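The dual-threshold routing rule itself reduces to a three-way decision on a confidence score. A toy version with fixed thresholds is sketched below; the threshold values are made up, whereas the paper adapts them online to channel state.

```python
def route(confidence, low=0.35, high=0.85):
    """Dual-threshold triage for an early-exit classifier: confident
    rare-event scores exit locally, ambiguous ones are offloaded to
    the edge server, and low scores are treated as common events."""
    if confidence >= high:
        return "local inference"      # rare event decided on-device
    if confidence >= low:
        return "offload to edge"      # uncertain: transmit for deeper model
    return "discard"                  # common event, nothing transmitted

for c in (0.9, 0.6, 0.1):
    print(c, "->", route(c))
```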
arXiv:2501.01614 [pdf] (eess.SY, math.OC)
DOI: 10.1177/03611981231170182 (https://doi.org/10.1177/03611981231170182)
Evaluation of Rail Decarbonization Alternatives: Framework and Application
Authors: Adrian Hernandez, Max TM Ng, Nazib Siddique, Pablo L. Durango-Cohen, Amgad Elgowainy, Hani S. Mahmassani, Michael Wang, Yan Zhou
arXiv:2501.01614 [pdf] https://arxiv.org/abs/2501.01614
Title: Evaluation of Rail Decarbonization Alternatives: Framework and Application
Authors: Adrian Hernandez, Max TM Ng, Nazib Siddique, Pablo L. Durango-Cohen, Amgad Elgowainy, Hani S. Mahmassani, Michael Wang, Yan Zhou
Subjects: eess.SY (Systems and Control); math.OC (Optimization and Control)
DOI: 10.1177/03611981231170182
Abstract: The Northwestern University Freight Rail Infrastructure and Energy Network Decarbonization (NUFRIEND) framework is a comprehensive, industry-oriented tool for simulating the deployment of new energy technologies, including biofuels, e-fuels, battery-electric, and hydrogen locomotives. By classifying fuel types into two categories based on deployment requirements, the associated optimal charging/fueling facility location and sizing problems are solved with a five-step framework. Life cycle analyses (LCA) and techno-economic analyses (TEA) are used to estimate carbon reduction, capital investments, cost of carbon reduction, and operational impacts, enabling sensitivity analysis with operational and technological parameters. The framework is illustrated on lower-carbon drop-in fuels as well as battery-electric technology deployments for US Eastern and Western Class I railroad networks. Drop-in fuel deployments are modeled as admixtures with diesel in existing locomotives, while battery-electric deployments are shown for varying technology penetration levels and locomotive ranges. When mixed in a 50 percent ratio with diesel, results show biodiesel's capacity to reduce emissions by 36 percent at a cost of 0.13 USD per kilogram of CO2 reduced, while e-fuels offer a 50 percent emissions-reduction potential at a cost of 0.22 USD per kilogram of CO2 reduced. Battery-electric results for 50 percent deployment over all ton-miles highlight the value of future innovations in battery energy density: scenarios assuming 800-mile-range locomotives show an estimated emissions reduction of 46 percent at a cost of 0.06 USD per kilogram of CO2 reduced, compared to a 16 percent emissions reduction at a cost of 0.11 USD per kilogram of CO2 reduced for 400-mile-range locomotives.
Submitted 2 January, 2025; originally announced January 2025.
Comments: 29 pages, 17 figures. This is the accepted version of a work that was published in Transportation Research Record
Journal ref: Transportation Research Record 2678.1 (2024): 102-121
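The study's headline metric, cost of carbon reduction in USD per kilogram of CO2 avoided, is simply incremental cost divided by avoided emissions. A small sketch with made-up inputs; the numbers below are hypothetical placeholders, not values from the NUFRIEND analyses.

    # Illustrative computation of the "cost of carbon reduction" metric.
    # Inputs are invented for the example, not NUFRIEND study data.
    def cost_of_carbon_reduction(incremental_cost_usd,
                                 baseline_emissions_kg,
                                 scenario_emissions_kg):
        """Incremental cost divided by the mass of CO2 avoided."""
        avoided = baseline_emissions_kg - scenario_emissions_kg
        if avoided <= 0:
            raise ValueError("Scenario does not reduce emissions.")
        return incremental_cost_usd / avoided

    # Hypothetical: 1.3 MUSD of extra spend avoiding 10 kt of CO2
    print(cost_of_carbon_reduction(1.3e6, 25e6, 15e6))  # -> 0.13 USD/kg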
arXiv:2501.01235 [pdf, other] https://arxiv.org/abs/2501.01235
Title: SVFR: A Unified Framework for Generalized Video Face Restoration
Authors: Zhiyao Wang, Xu Chen, Chengming Xu, Junwei Zhu, Xiaobin Hu, Jiangning Zhang, Chengjie Wang, Yuqi Liu, Yiyi Zhou, Rongrong Ji
Subjects: cs.CV (Computer Vision and Pattern Recognition); cs.LG (Machine Learning); eess.IV (Image and Video Processing)
Abstract: Face Restoration (FR) is a crucial area within image and video processing, focusing on reconstructing high-quality portraits from degraded inputs. Despite advancements in image FR, video FR remains relatively under-explored, primarily due to challenges related to temporal consistency, motion artifacts, and the limited availability of high-quality video data. Moreover, traditional face restoration typically prioritizes enhancing resolution and gives less consideration to related tasks such as facial colorization and inpainting. In this paper, we propose a novel approach for the Generalized Video Face Restoration (GVFR) task, which integrates video BFR, inpainting, and colorization tasks that we empirically show to benefit each other. We present a unified framework, termed Stable Video Face Restoration (SVFR), which leverages the generative and motion priors of Stable Video Diffusion (SVD) and incorporates task-specific information through a unified face restoration framework. A learnable task embedding is introduced to enhance task identification. Meanwhile, a novel Unified Latent Regularization (ULR) is employed to encourage shared feature representation learning among the different subtasks. To further enhance restoration quality and temporal stability, we introduce facial prior learning and self-referred refinement as auxiliary strategies used for both training and inference. The proposed framework effectively combines the complementary strengths of these tasks, enhancing temporal coherence and achieving superior restoration quality. This work advances the state-of-the-art in video FR and establishes a new paradigm for generalized video face restoration. Code and video demo are available at https://github.com/wangzhiyaoo/SVFR.git.
Submitted 3 January, 2025; v1 submitted 2 January, 2025; originally announced January 2025.
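The learnable task embedding can be read as standard conditioning of a shared backbone on a task index. A hedged sketch under that reading; the module shapes and the fuse-by-addition choice are assumptions for illustration, not SVFR's actual architecture.

    # Sketch: a shared block conditioned on a learnable per-task embedding
    # (e.g., BFR / inpainting / colorization), fused by addition.
    import torch
    import torch.nn as nn

    class TaskConditionedBlock(nn.Module):
        def __init__(self, dim=256, num_tasks=3):
            super().__init__()
            self.task_embed = nn.Embedding(num_tasks, dim)
            self.proj = nn.Linear(dim, dim)

        def forward(self, latent, task_id):
            # latent: (batch, tokens, dim); task_id: (batch,)
            cond = self.task_embed(task_id).unsqueeze(1)  # (batch, 1, dim)
            return self.proj(latent + cond)               # broadcast over tokens

    x = torch.randn(2, 16, 256)
    block = TaskConditionedBlock()
    y = block(x, torch.tensor([0, 2]))  # condition on two different tasks
    print(y.shape)                      # torch.Size([2, 16, 256])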
arXiv:2501.01108 [pdf, other] https://arxiv.org/abs/2501.01108
Title: MuQ: Self-Supervised Music Representation Learning with Mel Residual Vector Quantization
Authors: Haina Zhu, Yizhi Zhou, Hangting Chen, Jianwei Yu, Ziyang Ma, Rongzhi Gu, Yi Luo, Wei Tan, Xie Chen
Subjects: cs.SD (Sound); cs.AI (Artificial Intelligence); cs.CL (Computation and Language); cs.LG (Machine Learning); eess.AS (Audio and Speech Processing)
Abstract: Recent years have witnessed the success of foundation models pre-trained with self-supervised learning (SSL) in various music informatics understanding tasks, including music tagging, instrument classification, key detection, and more. In this paper, we propose a self-supervised music representation learning model for music understanding. Distinguished from previous studies adopting random projection or existing neural codecs, the proposed model, named MuQ, is trained to predict tokens generated by Mel Residual Vector Quantization (Mel-RVQ). Our Mel-RVQ utilizes a residual linear projection structure for Mel spectrum quantization to enhance the stability and efficiency of target extraction, leading to better performance.
Experiments on a large variety of downstream tasks demonstrate that MuQ outperforms previous self-supervised music representation models with only 0.9K hours of open-source pre-training data. Scaling up the data to over 160K hours and adopting iterative training consistently improves model performance. To further validate the strength of our model, we present MuQ-MuLan, a joint music-text embedding model based on contrastive learning, which achieves state-of-the-art performance on the zero-shot music tagging task on the MagnaTagATune dataset. Code and checkpoints are open source at https://github.com/tencent-ailab/MuQ.
Submitted 3 January, 2025; v1 submitted 2 January, 2025; originally announced January 2025.
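Residual vector quantization, the general technique behind Mel-RVQ, quantizes each frame with a stack of codebooks, each encoding the residual left by the previous one; the resulting token stack serves as the SSL prediction target. A toy sketch; codebook sizes and the plain nearest-neighbour quantizer are illustrative, not MuQ's design (which uses a residual linear projection structure).

    # Sketch: residual vector quantization over mel frames.
    import numpy as np

    rng = np.random.default_rng(0)
    n_mels, n_frames, n_codebooks, codebook_size = 80, 100, 4, 256
    codebooks = rng.normal(size=(n_codebooks, codebook_size, n_mels))

    def rvq_encode(mel):                      # mel: (n_frames, n_mels)
        residual = mel.copy()
        tokens = np.empty((n_codebooks, len(mel)), dtype=np.int64)
        for q, cb in enumerate(codebooks):
            # nearest codeword per frame for the current residual
            d = ((residual[:, None, :] - cb[None, :, :]) ** 2).sum(-1)
            tokens[q] = d.argmin(1)
            residual -= cb[tokens[q]]         # quantize what remains
        return tokens                         # SSL targets: predict these

    mel = rng.normal(size=(n_frames, n_mels))
    print(rvq_encode(mel).shape)              # (4, 100)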
arXiv:2412.16175 [pdf, other] https://arxiv.org/abs/2412.16175
Title: Mean-Variance Portfolio Selection by Continuous-Time Reinforcement Learning: Algorithms, Regret Analysis, and Empirical Study
Authors: Yilie Huang, Yanwei Jia, Xun Yu Zhou
Subjects: q-fin.PM (Portfolio Management); cs.LG (Machine Learning); eess.SY (Systems and Control); math.OC (Optimization and Control)
Abstract: We study continuous-time mean-variance portfolio selection in markets where stock prices are diffusion processes driven by observable factors that are also diffusion processes, yet the coefficients of these processes are unknown. Based on the recently developed reinforcement learning (RL) theory for diffusion processes, we present a general data-driven RL algorithm that learns the pre-committed investment strategy directly, without attempting to learn or estimate the market coefficients. For multi-stock Black-Scholes markets without factors, we further devise a baseline algorithm and prove its performance guarantee by deriving a sublinear regret bound in terms of Sharpe ratio. For performance enhancement and practical implementation, we modify the baseline algorithm into four variants and carry out an extensive empirical study to compare their performance, in terms of a host of common metrics, with a large number of widely used portfolio allocation strategies on S&P 500 constituents. The results demonstrate that the continuous-time RL strategies are consistently among the best, especially in a volatile bear market, and decisively outperform the model-based continuous-time counterparts by significant margins.
Submitted 8 December, 2024; originally announced December 2024.
Comments: 76 pages, 5 figures, 7 tables
MSC Class: 68T05; 91G10; 68Q25; 93E35; 93E20
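The Sharpe ratio in which the regret bound is stated can be computed from a realized return series. A small sketch; the annualization factor and the zero risk-free rate are simplifying assumptions, and the random returns are only for demonstration.

    # Sketch: annualized realized Sharpe ratio of a daily return series.
    import numpy as np

    def sharpe_ratio(returns, periods_per_year=252, risk_free=0.0):
        excess = np.asarray(returns) - risk_free / periods_per_year
        return np.sqrt(periods_per_year) * excess.mean() / excess.std(ddof=1)

    daily = np.random.default_rng(1).normal(5e-4, 1e-2, size=252)
    print(round(sharpe_ratio(daily), 3))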
arXiv:2412.16085 [pdf, other] https://arxiv.org/abs/2412.16085
Title: Efficient MedSAMs: Segment Anything in Medical Images on Laptop
Authors: Jun Ma, Feifei Li, Sumin Kim, Reza Asakereh, Bao-Hiep Le, Dang-Khoa Nguyen-Vu, Alexander Pfefferle, Muxin Wei, Ruochen Gao, Donghang Lyu, Songxiao Yang, Lennart Purucker, Zdravko Marinov, Marius Staring, Haisheng Lu, Thuy Thanh Dao, Xincheng Ye, Zhi Li, Gianluca Brugnara, Philipp Vollmuth, Martha Foltyn-Dumitru, Jaeyoung Cho, Mustafa Ahmed Mahmutoglu, Martin Bendszus, Irada Pflüger, et al. (57 additional authors not shown)
Subjects: eess.IV (Image and Video Processing); cs.CV (Computer Vision and Pattern Recognition)
Abstract: Promptable segmentation foundation models have emerged as a transformative approach to addressing the diverse needs in medical images, but most existing models require expensive computing, posing a big barrier to their adoption in clinical practice. In this work, we organized the first international competition dedicated to promptable medical image segmentation, featuring a large-scale dataset spanning nine common imaging modalities from over 20 different institutions. The top teams developed lightweight segmentation foundation models and implemented an efficient inference pipeline that substantially reduced computational requirements while maintaining state-of-the-art segmentation accuracy. Moreover, the post-challenge phase advanced the algorithms through the design of performance-booster and reproducibility tasks, resulting in improved algorithms and validated reproducibility of the winning solution. Furthermore, the best-performing algorithms have been incorporated into open-source software with a user-friendly interface to facilitate clinical adoption. The data and code are publicly available to foster further development of medical image segmentation foundation models and pave the way for impactful real-world applications.
Submitted 20 December, 2024; originally announced December 2024.
Comments: CVPR 2024 MedSAM on Laptop Competition Summary: https://www.codabench.org/competitions/1847/

arXiv:2412.13786 [pdf, other] https://arxiv.org/abs/2412.13786
Title: SongEditor: Adapting Zero-Shot Song Generation Language Model as a Multi-Task Editor
Authors: Chenyu Yang, Shuai Wang, Hangting Chen, Jianwei Yu, Wei Tan, Rongzhi Gu, Yaoxun Xu, Yizhi Zhou, Haina Zhu, Haizhou Li
Subjects: eess.AS (Audio and Speech Processing); cs.SD (Sound)
Abstract: The emergence of novel generative modeling paradigms, particularly audio language models, has significantly advanced the field of song generation. Although state-of-the-art models can concurrently synthesize vocals and accompaniment tracks up to several minutes long, research on partial adjustment or editing of existing songs, which would allow for more flexible and effective production, remains underexplored. In this paper, we present SongEditor, the first song-editing paradigm that introduces editing capabilities into language-modeling song generation approaches, facilitating both segment-wise and track-wise modifications. SongEditor offers the flexibility to adjust lyrics, vocals, and accompaniments, as well as to synthesize songs from scratch.
The core components of SongEditor include a music tokenizer, an autoregressive language model, and a diffusion generator, enabling the generation of an entire section, masked lyrics, or even separated vocals and background music. Extensive experiments demonstrate that the proposed SongEditor achieves exceptional performance in end-to-end song editing, as evidenced by both objective and subjective metrics. Audio samples are available at https://cypress-yang.github.io/SongEditor_demo/.
Submitted 28 January, 2025; v1 submitted 18 December, 2024; originally announced December 2024.
Comments: Accepted by AAAI2025
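Segment-wise editing with an autoregressive token model generally amounts to masking the edited span and letting the model regenerate it conditioned on the surrounding context. A hedged sketch of that pattern; the sentinel tokens and greedy loop are illustrative, not SongEditor's actual interface.

    # Sketch: mask a segment, keep its left/right context, regenerate the
    # span with a next-token function (lm_step is a hypothetical stand-in).
    MASK, SEP, EOS = "<mask>", "<sep>", "<eos>"

    def build_edit_prompt(tokens, start, end):
        # keep context, drop the segment to be rewritten
        return tokens[:start] + [MASK] + tokens[end:] + [SEP]

    def infill(lm_step, prompt, max_new=64):
        out = []
        while len(out) < max_new:
            nxt = lm_step(prompt + out)  # lm_step: tokens -> next token
            if nxt == EOS:
                break
            out.append(nxt)
        return out  # regenerated tokens for the masked segment

    # Trivial demo with a stub model that immediately ends generation:
    print(infill(lambda toks: EOS, build_edit_prompt(list("abcdef"), 2, 4)))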
href="/search/eess?searchtype=author&amp;query=Xiao%2C+X">Xi Xiao</a>, <a href="/search/eess?searchtype=author&amp;query=Penty%2C+R">Richard Penty</a>, <a href="/search/eess?searchtype=author&amp;query=Cheng%2C+Q">Qixiang Cheng</a>, <a href="/search/eess?searchtype=author&amp;query=Chi%2C+N">Nan Chi</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+J">Junwen Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.12126v1-abstract-short" style="display: inline;"> The rapid advancement of generative artificial intelligence (AI) in recent years has profoundly reshaped modern lifestyles, necessitating a revolutionary architecture to support the growing demands for computational power. Cloud computing has become the driving force behind this transformation. However, it consumes significant power and faces computation security risks due to the reliance on exten&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.12126v1-abstract-full').style.display = 'inline'; document.getElementById('2412.12126v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.12126v1-abstract-full" style="display: none;"> The rapid advancement of generative artificial intelligence (AI) in recent years has profoundly reshaped modern lifestyles, necessitating a revolutionary architecture to support the growing demands for computational power. Cloud computing has become the driving force behind this transformation. However, it consumes significant power and faces computation security risks due to the reliance on extensive data centers and servers in the cloud. Reducing power consumption while enhancing computational scale remains persistent challenges in cloud computing. Here, we propose and experimentally demonstrate an optical cloud computing system that can be seamlessly deployed across edge-metro network. By modulating inputs and models into light, a wide range of edge nodes can directly access the optical computing center via the edge-metro network. The experimental validations show an energy efficiency of 118.6 mW/TOPs (tera operations per second), reducing energy consumption by two orders of magnitude compared to traditional electronic-based cloud computing solutions. Furthermore, it is experimentally validated that this architecture can perform various complex generative AI models through parallel computing to achieve image generation tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.12126v1-abstract-full').style.display = 'none'; document.getElementById('2412.12126v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. 
arXiv:2412.07236 [pdf, other] https://arxiv.org/abs/2412.07236
Title: CBraMod: A Criss-Cross Brain Foundation Model for EEG Decoding
Authors: Jiquan Wang, Sha Zhao, Zhiling Luo, Yangxuan Zhou, Haiteng Jiang, Shijian Li, Tao Li, Gang Pan
Subjects: eess.SP (Signal Processing); cs.AI (Artificial Intelligence); cs.LG (Machine Learning); q-bio.NC (Neurons and Cognition)
Abstract: Electroencephalography (EEG) is a non-invasive technique to measure and record brain electrical activity, widely used in various BCI and healthcare applications. Early EEG decoding methods relied on supervised learning, limited to specific tasks and datasets, which hindered model performance and generalizability. Following the success of large language models, a growing body of studies has focused on EEG foundation models. However, challenges remain. First, most existing EEG foundation models employ a full EEG modeling strategy: they model the spatial and temporal dependencies between all EEG patches together, ignoring that these dependencies are heterogeneous due to the unique structural characteristics of EEG signals. Second, existing EEG foundation models have limited generalizability on a wide range of downstream BCI tasks, because EEG data come in varying formats that are challenging to adapt to.
To address these challenges, we propose a novel foundation model called CBraMod. Specifically, we devise a criss-cross transformer as the backbone to thoroughly leverage the structural characteristics of EEG signals, modeling spatial and temporal dependencies separately through two parallel attention mechanisms. We also utilize an asymmetric conditional positional encoding scheme that encodes the positional information of EEG patches and is easily adapted to EEG with diverse formats. CBraMod is pre-trained on a very large corpus of EEG through patch-based masked EEG reconstruction. We evaluate CBraMod on up to 10 downstream BCI tasks (12 public datasets). CBraMod achieves state-of-the-art performance across this wide range of tasks, demonstrating its strong capability and generalizability. The source code is publicly available at https://github.com/wjq-learning/CBraMod.
Submitted 15 February, 2025; v1 submitted 10 December, 2024; originally announced December 2024.
Comments: Accepted by The Thirteenth International Conference on Learning Representations (ICLR 2025)
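Criss-cross (axial) attention over a channels-by-time patch grid can be pictured as two parallel attentions, one along each axis. The head count and the residual-sum fusion below are assumptions for illustration, not CBraMod's exact design.

    # Sketch: parallel attention across EEG channels and across time patches.
    import torch
    import torch.nn as nn

    class CrissCrossAttention(nn.Module):
        def __init__(self, dim=128, heads=4):
            super().__init__()
            self.spatial = nn.MultiheadAttention(dim, heads, batch_first=True)
            self.temporal = nn.MultiheadAttention(dim, heads, batch_first=True)

        def forward(self, x):                 # x: (batch, C, T, dim)
            b, c, t, d = x.shape
            xs = x.permute(0, 2, 1, 3).reshape(b * t, c, d)  # across channels
            xt = x.reshape(b * c, t, d)                      # across time
            ys, _ = self.spatial(xs, xs, xs)
            yt, _ = self.temporal(xt, xt, xt)
            ys = ys.reshape(b, t, c, d).permute(0, 2, 1, 3)
            yt = yt.reshape(b, c, t, d)
            return x + ys + yt                # residual fusion of both axes

    x = torch.randn(2, 19, 30, 128)           # 19 channels, 30 time patches
    print(CrissCrossAttention()(x).shape)     # torch.Size([2, 19, 30, 128])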
arXiv:2412.06581 https://arxiv.org/abs/2412.06581
Title: EmoSpeech: A Corpus of Emotionally Rich and Contextually Detailed Speech Annotations
Authors: Weizhen Bian, Yubo Zhou, Kaitai Zhang, Xiaohan Gu
Subjects: cs.SD (Sound); cs.AI (Artificial Intelligence); eess.AS (Audio and Speech Processing)
Abstract: Advances in text-to-speech (TTS) technology have significantly improved the quality of generated speech, closely matching the timbre and intonation of the target speaker. However, due to the inherent complexity of human emotional expression, developing TTS systems capable of controlling subtle emotional differences remains a formidable challenge. Existing emotional speech databases often suffer from overly simplistic labelling schemes that fail to capture a wide range of emotional states, limiting the effectiveness of emotion synthesis in TTS applications. To this end, recent efforts have focused on building databases that use natural language annotations to describe speech emotions. However, these approaches are costly, and more emotional depth is required to train robust systems. In this paper, we propose a novel process for building databases by systematically extracting emotion-rich speech segments and annotating them with detailed natural language descriptions through a generative model. This approach enhances the emotional granularity of the database and significantly reduces the reliance on costly manual annotations by automatically augmenting the data with high-level language models. The resulting database provides a scalable and economically viable solution, offering a more nuanced and dynamic basis for developing emotionally controlled TTS systems.
Submitted 12 December, 2024; v1 submitted 9 December, 2024; originally announced December 2024.
Comments: I did not obtain the necessary approval from my academic supervisor prior to submission and there are issues with my current paper

arXiv:2412.06507 [pdf, other] https://arxiv.org/abs/2412.06507
Title: BATseg: Boundary-aware Multiclass Spinal Cord Tumor Segmentation on 3D MRI Scans
Authors: Hongkang Song, Zihui Zhang, Yanpeng Zhou, Jie Hu, Zishuo Wang, Hou Him Chan, Chon Lok Lei, Chen Xu, Yu Xin, Bo Yang
Subjects: eess.IV (Image and Video Processing); cs.CV (Computer Vision and Pattern Recognition); cs.LG (Machine Learning)
Abstract: Spinal cord tumors significantly contribute to neurological morbidity and mortality. Precise morphometric quantification, encompassing the size, location, and type of such tumors, holds promise for optimizing treatment planning strategies. Although recent methods have demonstrated excellent performance in medical image segmentation, they primarily focus on discerning shapes with relatively large morphology, such as brain tumors, and overlook the challenging problem of identifying spinal cord tumors, which tend to have tiny sizes and diverse locations and shapes.
To tackle this hard problem of multiclass spinal cord tumor segmentation, we propose a new method, called BATseg, that learns a tumor surface distance field by applying our new multiclass boundary-aware loss function. To verify the effectiveness of our approach, we also introduce the first large-scale spinal cord tumor dataset. It comprises gadolinium-enhanced T1-weighted 3D MRI scans from 653 patients and contains the four most common spinal cord tumor types: astrocytomas, ependymomas, hemangioblastomas, and spinal meningiomas. Extensive experiments on our dataset and another public kidney tumor segmentation dataset show that our proposed method achieves superior performance for multiclass tumor segmentation.
Submitted 9 December, 2024; originally announced December 2024.
Comments: ECCV 2024 Workshop on BioImage Computing. Code and data are available at: https://github.com/vLAR-group/BATseg
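A generic reading of "learning a tumor surface distance field with a boundary-aware loss" is to regress, per class, the distance to the mask boundary. A hedged single-class sketch under that reading; the target construction and the L1 penalty are assumptions, not BATseg's exact formulation.

    # Sketch: build a distance-to-boundary target from a label mask and
    # regress it with an L1 penalty.
    import numpy as np
    from scipy.ndimage import distance_transform_edt

    def surface_distance_target(mask):
        """Unsigned distance (in voxels) to the mask boundary."""
        if mask.any() and not mask.all():
            inside = distance_transform_edt(mask)
            outside = distance_transform_edt(~mask)
            return np.where(mask, inside, outside)
        return np.zeros(mask.shape)

    def boundary_aware_l1(pred_dist, mask):
        return np.abs(pred_dist - surface_distance_target(mask)).mean()

    gt = np.zeros((8, 8, 8), dtype=bool); gt[2:5, 2:5, 2:5] = True
    print(boundary_aware_l1(np.zeros(gt.shape), gt))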
arXiv:2412.04266 [pdf, other] https://arxiv.org/abs/2412.04266
Title: Representation Purification for End-to-End Speech Translation
Authors: Chengwei Zhang, Yue Zhou, Rui Zhao, Yidong Chen, Xiaodong Shi
Subjects: cs.CL (Computation and Language); cs.SD (Sound); eess.AS (Audio and Speech Processing)
Abstract: Speech-to-text translation (ST) is a cross-modal task that involves converting spoken language into text in a different language. Previous research primarily focused on enhancing speech translation by facilitating knowledge transfer from machine translation, exploring various methods to bridge the gap between the speech and text modalities. Despite substantial progress, factors in speech that are not relevant to the translation content, such as timbre and rhythm, often limit the efficiency of knowledge transfer. In this paper, we conceptualize speech representation as a combination of content-agnostic and content-relevant factors. We examine the impact of content-agnostic factors on translation performance through preliminary experiments and observe significant performance deterioration when content-agnostic perturbations are introduced to speech signals. To address this issue, we propose a Speech Representation Purification with Supervision Enhancement (SRPSE) framework, which excludes the content-agnostic components within speech representations to mitigate their negative impact on ST. Experiments on the MuST-C and CoVoST-2 datasets demonstrate that SRPSE significantly improves translation performance across all translation directions in three settings and achieves preeminent performance under a transcript-free setting.
Submitted 5 December, 2024; originally announced December 2024.
Comments: Accepted by COLING 2025
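The preliminary experiment, introducing content-agnostic perturbations and observing the drop in translation quality, can be approximated with off-the-shelf pitch and tempo transforms. A sketch; the specific perturbation choices are assumptions, and translate() stands in for any end-to-end ST model.

    # Sketch: perturb timbre/rhythm proxies (pitch, tempo) while leaving
    # the spoken content intact, then compare translation quality.
    import librosa

    def content_agnostic_perturb(wav, sr, n_steps=2.0, rate=1.1):
        shifted = librosa.effects.pitch_shift(wav, sr=sr, n_steps=n_steps)
        return librosa.effects.time_stretch(shifted, rate=rate)

    # wav, sr = librosa.load("sample.wav", sr=16000)
    # baseline  = translate(wav)
    # perturbed = translate(content_agnostic_perturb(wav, sr))
    # A large quality drop indicates sensitivity to content-agnostic factors.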
arXiv:2412.03279 [pdf, other] https://arxiv.org/abs/2412.03279
Title: Rotograb: Combining Biomimetic Hands with Industrial Grippers using a Rotating Thumb
Authors: Arnaud Bersier, Matteo Leonforte, Alessio Vanetta, Sarah Lia Andrea Wotke, Andrea Nappi, Yifan Zhou, Sebastiano Oliani, Alexander M. Kübler, Robert K. Katzschmann
Subjects: cs.RO (Robotics); eess.SY (Systems and Control)
Abstract: The development of robotic grippers and hands for automation aims to emulate human dexterity without sacrificing the efficiency of industrial grippers. This study introduces Rotograb, a tendon-actuated robotic hand featuring a novel rotating thumb, designed to combine the dexterity of human hands with the efficiency of industrial grippers. The rotating thumb enlarges the workspace and allows in-hand manipulation. A novel joint design minimizes movement interference and simplifies the kinematics, using a cutout for tendon routing. We integrate teleoperation, using a depth camera for real-time tracking, and autonomous manipulation powered by reinforcement learning with proximal policy optimization. Experimental evaluations demonstrate that Rotograb's rotating thumb greatly improves both operational versatility and workspace.
It can handle various grasping and manipulation tasks with objects from the YCB dataset, with particularly good results when rotating objects within its grasp. Rotograb represents a notable step towards bridging the capability gap between human hands and industrial grippers. The tendon-routing and thumb-rotating mechanisms allow for a new level of control and dexterity. Integrating teleoperation and autonomous learning underscores Rotograb's adaptability and sophistication, promising substantial advancements in both robotics research and practical applications.
Submitted 4 December, 2024; originally announced December 2024.
Journal ref: 2024 International Conference on Intelligent Robots and Systems

arXiv:2412.01100 [pdf, other] https://arxiv.org/abs/2412.01100
Title: The Codec Language Model-based Zero-Shot Spontaneous Style TTS System for CoVoC Challenge 2024
Authors: Shuoyi Zhou, Yixuan Zhou, Weiqin Li, Jun Chen, Runchuan Ye, Weihao Wu, Zijian Lin, Shun Lei, Zhiyong Wu
Subjects: cs.SD (Sound); eess.AS (Audio and Speech Processing)
Abstract: This paper describes the zero-shot spontaneous style TTS system for the ISCSLP 2024 Conversational Voice Clone Challenge (CoVoC). We propose a LLaMA-based codec language model with a delay pattern to achieve spontaneous-style voice cloning.
arXiv:2412.01100 [pdf, other]  cs.SD (Sound); eess.AS (Audio and Speech Processing)
The Codec Language Model-based Zero-Shot Spontaneous Style TTS System for CoVoC Challenge 2024
Authors: Shuoyi Zhou, Yixuan Zhou, Weiqin Li, Jun Chen, Runchuan Ye, Weihao Wu, Zijian Lin, Shun Lei, Zhiyong Wu
Abstract: This paper describes the zero-shot spontaneous style TTS system for the ISCSLP 2024 Conversational Voice Clone Challenge (CoVoC). We propose a LLaMA-based codec language model with a delay pattern to achieve spontaneous style voice cloning. To improve speech intelligibility, we introduce the Classifier-Free Guidance (CFG) strategy in the language model to strengthen conditional guidance on token prediction. To generate high-quality utterances, we adopt effective data preprocessing operations and fine-tune our model with selected high-quality spontaneous speech data. The official evaluations in the CoVoC constrained track show that our system achieves the best speech naturalness MOS of 3.80 and obtains considerable speech quality and speaker similarity results.
Submitted 4 February, 2025; v1 submitted 1 December, 2024; originally announced December 2024.
Comments: Accepted by ISCSLP 2024
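The CFG strategy mentioned above has a simple decoding-time core. A generic sketch of one common logit-combination formulation (not necessarily the exact variant used by the authors; the guidance scale w and the logits are invented):

    import numpy as np

    def cfg_logits(cond_logits, uncond_logits, w=1.5):
        # Classifier-free guidance: push the conditional logits away from
        # the unconditional ones by guidance scale w (w=0 recovers the
        # purely conditional logits in this formulation).
        return cond_logits + w * (cond_logits - uncond_logits)

    vocab = 8
    rng = np.random.default_rng(1)
    cond = rng.normal(size=vocab)
    uncond = rng.normal(size=vocab)
    guided = cfg_logits(cond, uncond, w=1.5)
    probs = np.exp(guided - guided.max())
    probs /= probs.sum()          # softmax over the guided logits
    print(probs.argmax())

Larger w typically strengthens adherence to the conditioning signal; the paper applies this idea to token prediction inside a codec language model.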
arXiv:2411.18290 [pdf, other]  eess.IV (Image and Video Processing); cs.CV (Computer Vision and Pattern Recognition)
Leveraging Semantic Asymmetry for Precise Gross Tumor Volume Segmentation of Nasopharyngeal Carcinoma in Planning CT
Authors: Zi Li, Ying Chen, Zeli Chen, Yanzhou Su, Tai Ma, Tony C. W. Mok, Yan-Jie Zhou, Yunhai Bai, Zhinlin Zheng, Le Lu, Yirui Wang, Jia Ge, Xianghua Ye, Senxiang Yan, Dakai Jin
Abstract: In the radiation therapy of nasopharyngeal carcinoma (NPC), clinicians typically delineate the gross tumor volume (GTV) using non-contrast planning computed tomography to ensure accurate radiation dose delivery. However, the low contrast between tumors and adjacent normal tissues necessitates that radiation oncologists manually delineate the tumors, often relying on diagnostic MRI for guidance. In this study, we propose a novel approach to directly segment NPC gross tumors on non-contrast planning CT images, circumventing potential registration errors when aligning MRI or MRI-derived tumor masks to planning CT. To address the low contrast issues between tumors and adjacent normal structures in planning CT, we introduce a 3D Semantic Asymmetry Tumor segmentation (SATs) method. Specifically, we posit that a healthy nasopharyngeal region is characteristically bilaterally symmetric, whereas the emergence of nasopharyngeal carcinoma disrupts this symmetry. Then, we propose a Siamese contrastive learning segmentation framework that minimizes the voxel-wise distance between original and flipped areas without tumor and encourages a larger distance between original and flipped areas with tumor. Thus, our approach enhances the sensitivity of features to semantic asymmetries. Extensive experiments demonstrate that the proposed SATs achieves the leading NPC GTV segmentation performance in both internal and external testing, e.g., with at least 2% absolute Dice score improvement and 12% average distance error reduction when compared to other state-of-the-art methods in the external testing.
Submitted 18 December, 2024; v1 submitted 27 November, 2024; originally announced November 2024.
arXiv:2411.16937 [pdf]  eess.SY (Systems and Control)
Traffic Wave Properties for Automated Vehicles During Traffic Oscillations via Analytical Approximations
Authors: Yang Zhou, Sixu Li, Wissam Kontar, Fan Pu, Anupam Srivastava, Soyoung Ahn
Abstract: This paper presents an analytical approximation framework to understand the dynamics of traffic wave propagation for Automated Vehicles (AVs) during traffic oscillations. The framework systematically unravels the intricate relationships between the longitudinal control model of the AVs and the properties of traffic waves. We apply Laplace transformation and describing function analysis to mathematically derive the traffic wave properties of an AV in car-following scenarios. Further, we incorporate Newell's car-following model to determine the speed of the traffic waves. Our analysis extends to both homogeneous and heterogeneous traffic, systematically handling intra-heterogeneities and inter-heterogeneities in traffic wave propagation using the established analytical framework. We validate our approach via numerical simulations and show the connections between the AV control system and traffic wave properties. This research emphasizes the importance of rethinking our understanding of traffic wave properties when AVs are present in the traffic system.
Submitted 25 November, 2024; originally announced November 2024.
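The paper's Laplace-domain analysis is not reproduced in this listing, but the snippet below evaluates the frequency response of a textbook linear car-following controller, a = k1*(spacing error) + k2*(speed difference) with time gap h, whose leader-to-follower transfer function is G(s) = (k2*s + k1) / (s^2 + (k1*h + k2)*s + k1). The gains are hypothetical; |G(jw)| <= 1 at all frequencies is the usual string-stability check.

    import numpy as np

    def transfer_gain(omega, k1=0.5, k2=0.9, h=1.2):
        # |G(jw)| for G(s) = (k2 s + k1) / (s^2 + (k1 h + k2) s + k1)
        s = 1j * omega
        return np.abs((k2 * s + k1) / (s**2 + (k1 * h + k2) * s + k1))

    omegas = np.linspace(1e-3, 5.0, 500)
    gains = transfer_gain(omegas)
    # Amplification above 1 at any frequency indicates string instability,
    # i.e. oscillations grow as they propagate upstream through a platoon.
    print("string stable:", bool(np.all(gains <= 1.0)))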
arXiv:2411.13456 [pdf]  eess.SY (Systems and Control)
Why Anticipatory Sensing Matters in Commercial ACC Systems under Cut-In Scenarios: A Perspective from Stochastic Safety Analysis
Authors: Hao Zhang, Sixu Li, Zihao Li, Mohammad Anis, Dominique Lord, Yang Zhou
Abstract: This study presents an analytical solution for the vehicle state evolution of Adaptive Cruise Control (ACC) systems under cut-in scenarios, incorporating sensing delays and anticipation using the Lambert W function. The theoretical analysis demonstrates that the vehicle state evolution and the corresponding safety of ACC in cut-in situations are influenced by multiple factors, including the original leading vehicle's state, the initial conditions of the cut-in vehicle, subsequent cut-in maneuvers, sensing delays, and the ACC's anticipation capabilities. To quantitatively assess these influences, a series of numerical experiments were conducted to perform a stochastic safety analysis of ACC systems, accounting for embedded sensing delays and anticipation, using empirically calibrated control parameters from real-world data. The experiments revealed that the impact of sensing delays on ACC is multifaceted. Specifically, sensing delays negatively affect ACC stability, with the severity increasing as the delay lengthens. Furthermore, collision risk in cut-in scenarios becomes more significant with sensing delays, particularly when the cut-in vehicle is slower than the following vehicle and when cut-ins are aggressive. However, anticipation plays a crucial role in mitigating these risks. Even with a 0.6-second anticipation, collision risk can be reduced by 91% in highly adverse scenarios. Finally, both sensing delays and anticipation have effects that intensify with their duration. An anticipation period of 2 seconds effectively ensures safety in aggressive cut-in conditions, even in the presence of sensing delays.
Submitted 20 November, 2024; originally announced November 2024.
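The Lambert W machinery is standard for delay systems: for a scalar delayed feedback law x'(t) = -k*x(t - tau), the characteristic equation s = -k*exp(-s*tau) has its principal root at s = W(-k*tau)/tau. The sketch below uses SciPy's lambertw to check how a sensing delay pushes this root toward instability; k and tau are hypothetical stand-ins, not the paper's calibrated ACC parameters.

    import numpy as np
    from scipy.special import lambertw

    def principal_root(k, tau):
        # Rightmost characteristic root of x'(t) = -k * x(t - tau):
        # s = W(-k*tau) / tau, with W the principal Lambert W branch.
        return lambertw(-k * tau, 0) / tau

    k = 0.8                              # hypothetical feedback gain
    for tau in (0.1, 0.5, 1.0, 2.0):     # sensing delay in seconds
        s = principal_root(k, tau)
        print(f"tau={tau:.1f}s  Re(s)={s.real:+.3f}  "
              f"{'stable' if s.real < 0 else 'unstable'}")

For this toy system the root crosses into the right half-plane once k*tau exceeds pi/2, which is the qualitative effect the stochastic safety analysis quantifies for real ACC controllers.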
arXiv:2411.11980 [pdf, other]  cs.LG (Machine Learning); eess.SY (Systems and Control)
Transmission Line Outage Probability Prediction Under Extreme Events Using Peter-Clark Bayesian Structural Learning
Authors: Xiaolin Chen, Qiuhua Huang, Yuqi Zhou
Abstract: Recent years have seen a notable increase in the frequency and intensity of extreme weather events. With a rising number of power outages caused by these events, accurate prediction of power line outages is essential for safe and reliable operation of power grids. The Bayesian network is a probabilistic model that is very effective for predicting line outages under weather-related uncertainties. However, most existing studies in this area offer general risk assessments, but fall short of providing specific outage probabilities. In this work, we introduce a novel approach for predicting transmission line outage probabilities using a Bayesian network combined with Peter-Clark (PC) structural learning. Our approach not only enables precise outage probability calculations, but also demonstrates better scalability and robust performance, even with limited data. Case studies using data from BPA and NOAA show the effectiveness of this approach, while comparisons with several existing methods further highlight its advantages.
Submitted 18 November, 2024; originally announced November 2024.
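To make the "specific outage probabilities" point concrete, here is a minimal Bayesian-network-style calculation with hand-made conditional probability tables: the outage probability of a line is obtained by marginalizing over discretized weather states. The structure and all numbers are invented for illustration; in the paper the network structure is learned from data with the PC algorithm.

    import numpy as np

    # P(wind state): calm, windy, storm  (hypothetical priors)
    p_wind = np.array([0.7, 0.25, 0.05])
    # P(outage = 1 | wind state)  (hypothetical CPT)
    p_out_given_wind = np.array([0.001, 0.02, 0.30])

    # Marginal outage probability: sum over w of P(outage | w) * P(w)
    p_outage = float(p_out_given_wind @ p_wind)
    print(f"P(outage) = {p_outage:.4f}")

    # Posterior over wind given an observed outage (Bayes' rule),
    # the kind of query a learned network answers in closed form.
    posterior = p_out_given_wind * p_wind / p_outage
    print("P(wind | outage):", np.round(posterior, 3))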
arXiv:2411.11791 [pdf, other]  eess.SY (Systems and Control)
Machine Learning-Assisted Distribution System Network Reconfiguration Problem
Authors: Richard Asiamah, Yuqi Zhou, Ahmed S. Zamzam
Abstract: High penetration from volatile renewable energy resources in the grid and the varying nature of loads raise the need for frequent line switching to ensure the efficient operation of electrical distribution networks. Operators must ensure maximum load delivery, reduced losses, and operation within voltage limits. However, computations to decide the optimal feeder configuration are often computationally expensive and intractable, making them unsuitable for real-time operations. This is mainly due to the existence of binary variables in the network reconfiguration optimization problem. To tackle this issue, we have devised an approach that leverages machine learning techniques to reshape distribution networks featuring multiple substations. This involves predicting the substation responsible for serving each part of the network, leaving simpler and more tractable Optimal Power Flow problems to be solved. This method can produce accurate results in a significantly faster time, as demonstrated using the IEEE 37-bus distribution feeder. Compared to traditional optimization-based approaches, a feasible solution is achieved approximately ten times faster for all the tested scenarios.
Submitted 18 November, 2024; originally announced November 2024.
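A rough sketch of the learning step described above, with invented data: a classifier predicts which substation serves each bus from per-bus features, after which only a continuous Optimal Power Flow per substation would remain to be solved (the OPF itself is omitted here; this is not the authors' pipeline).

    import numpy as np
    from sklearn.tree import DecisionTreeClassifier

    rng = np.random.default_rng(3)
    # Hypothetical training set: per-bus features (e.g. load level,
    # electrical distance to each substation) and the substation label
    # taken from previously solved reconfiguration instances.
    X = rng.normal(size=(500, 4))
    y = (X[:, 0] + 0.5 * X[:, 1] > 0).astype(int)   # synthetic labels

    clf = DecisionTreeClassifier(max_depth=5).fit(X, y)

    # At operation time, predict the bus-to-substation assignment,
    # which fixes the binary variables of the reconfiguration problem.
    X_new = rng.normal(size=(10, 4))
    print(clf.predict(X_new))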
arXiv:2411.10772 [pdf, other]  eess.IV (Image and Video Processing); cs.AI (Artificial Intelligence); cs.CV (Computer Vision and Pattern Recognition); cs.LG (Machine Learning); stat.ML (Machine Learning)
MRI Parameter Mapping via Gaussian Mixture VAE: Breaking the Assumption of Independent Pixels
Authors: Moucheng Xu, Yukun Zhou, Tobias Goodwin-Allcock, Kimia Firoozabadi, Joseph Jacob, Daniel C. Alexander, Paddy J. Slator
Abstract: We introduce and demonstrate a new paradigm for quantitative parameter mapping in MRI. Parameter mapping techniques, such as diffusion MRI and quantitative MRI, have the potential to robustly and repeatably measure biologically-relevant tissue maps that strongly relate to underlying microstructure. Quantitative maps are calculated by fitting a model to multiple images, e.g. with least-squares or machine learning. However, the overwhelming majority of model fitting techniques assume that each voxel is independent, ignoring any co-dependencies in the data. This makes model fitting sensitive to voxelwise measurement noise, hampering reliability and repeatability. We propose a self-supervised deep variational approach that breaks the assumption of independent pixels, leveraging redundancies in the data to effectively perform data-driven regularisation of quantitative maps. We demonstrate that our approach outperforms current model fitting techniques in dMRI simulations and real data. Especially with a Gaussian mixture prior, our model enables sharper quantitative maps, revealing finer anatomical details that are not present in the baselines. Our approach can hence support the clinical adoption of parameter mapping methods such as dMRI and qMRI.
Submitted 16 November, 2024; originally announced November 2024.
Comments: NeurIPS 2024 Workshop in Machine Learning and the Physical Sciences
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09783v2-abstract-full').style.display = 'none'; document.getElementById('2411.09783v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 14 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.18610">arXiv:2410.18610</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.18610">pdf</a>, <a href="https://arxiv.org/format/2410.18610">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> A Joint Representation Using Continuous and Discrete Features for Cardiovascular Diseases Risk Prediction on Chest CT Scans </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Xu%2C+M">Minfeng Xu</a>, <a href="/search/eess?searchtype=author&amp;query=Fan%2C+C">Chen-Chen Fan</a>, <a href="/search/eess?searchtype=author&amp;query=Zhou%2C+Y">Yan-Jie Zhou</a>, <a href="/search/eess?searchtype=author&amp;query=Guo%2C+W">Wenchao Guo</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+P">Pan Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Qi%2C+J">Jing Qi</a>, <a href="/search/eess?searchtype=author&amp;query=Lu%2C+L">Le Lu</a>, <a href="/search/eess?searchtype=author&amp;query=Chao%2C+H">Hanqing Chao</a>, <a href="/search/eess?searchtype=author&amp;query=He%2C+K">Kunlun He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.18610v2-abstract-short" style="display: inline;"> Cardiovascular diseases (CVD) remain a leading health concern and contribute significantly to global mortality rates. While clinical advancements have led to a decline in CVD mortality, accurately identifying individuals who could benefit from preventive interventions remains an unsolved challenge in preventive cardiology. Current CVD risk prediction models, recommended by guidelines, are based on&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18610v2-abstract-full').style.display = 'inline'; document.getElementById('2410.18610v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.18610v2-abstract-full" style="display: none;"> Cardiovascular diseases (CVD) remain a leading health concern and contribute significantly to global mortality rates. While clinical advancements have led to a decline in CVD mortality, accurately identifying individuals who could benefit from preventive interventions remains an unsolved challenge in preventive cardiology. 
arXiv:2410.18610 [pdf, other]  eess.IV (Image and Video Processing); cs.CV (Computer Vision and Pattern Recognition)
A Joint Representation Using Continuous and Discrete Features for Cardiovascular Diseases Risk Prediction on Chest CT Scans
Authors: Minfeng Xu, Chen-Chen Fan, Yan-Jie Zhou, Wenchao Guo, Pan Liu, Jing Qi, Le Lu, Hanqing Chao, Kunlun He
Abstract: Cardiovascular diseases (CVD) remain a leading health concern and contribute significantly to global mortality rates. While clinical advancements have led to a decline in CVD mortality, accurately identifying individuals who could benefit from preventive interventions remains an unsolved challenge in preventive cardiology. Current CVD risk prediction models, recommended by guidelines, are based on limited traditional risk factors or use CT imaging to acquire quantitative biomarkers, and still have limitations in predictive accuracy and applicability. On the other hand, end-to-end trained CVD risk prediction methods leveraging deep learning on CT images often fail to provide transparent and explainable decision grounds for assisting physicians. In this work, we propose a novel joint representation that integrates discrete quantitative biomarkers and continuous deep features extracted from chest CT scans. Our approach starts with a deep CVD risk classification model that captures comprehensive continuous deep-learning features while jointly obtaining clinically established quantitative biomarkers via segmentation models. In the feature joint representation stage, we use an instance-wise feature-gated mechanism to align the continuous and discrete features, followed by a soft instance-wise feature interaction mechanism fostering independent and effective feature interaction for the final CVD risk prediction. Our method substantially improves CVD risk predictive performance and offers individual contribution analysis of each biomarker, which is important in assisting physicians' decision-making processes. We validated our method on a public chest low-dose CT dataset and a private external chest standard-dose CT patient cohort of 17,207 CT volumes from 6,393 unique subjects, and demonstrated superior predictive performance, achieving AUCs of 0.875 and 0.843, respectively.
Submitted 15 November, 2024; v1 submitted 24 October, 2024; originally announced October 2024.
Comments: 23 pages, 9 figures
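The "instance-wise feature-gated mechanism" is described only at a high level; a generic gated-fusion sketch in that spirit (invented shapes and weights, not the paper's architecture) might look like:

    import numpy as np

    def sigmoid(x):
        return 1.0 / (1.0 + np.exp(-x))

    rng = np.random.default_rng(5)
    deep = rng.normal(size=(2, 16))        # continuous deep features per scan
    bio = rng.normal(size=(2, 4))          # discrete quantitative biomarkers

    # Instance-wise gate computed from both feature sets; the gate decides,
    # per sample and per channel, how much of each source to pass through.
    W_gate = rng.normal(size=(20, 16)) * 0.1
    W_bio = rng.normal(size=(4, 16)) * 0.1
    gate = sigmoid(np.concatenate([deep, bio], axis=1) @ W_gate)
    fused = gate * deep + (1.0 - gate) * (bio @ W_bio)
    print(fused.shape)   # (2, 16) joint representation fed to the classifier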
arXiv:2410.17607 [pdf, other]  eess.SY (Systems and Control)
Exploiting Data Centres and Local Energy Communities Synergies for Market Participation
Authors: Ángel Paredes, Yihong Zhou, Chaimaa Essayeh, José A. Aguado, Thomas Morstyn
Abstract: The evolving energy landscape has propelled energy communities to the forefront of modern energy management. However, existing research has yet to explore the potential synergies between data centres and energy communities, necessitating an assessment of their collective capabilities for cost efficiency, waste heat optimisation, and market participation. This paper presents a mixed integer linear programming model to assess the collaborative performance of energy communities, data centres and energy markets. The evaluation focuses on the efficient use of waste heat and the flexibility of job scheduling while minimising system energy costs and maintaining quality of service requirements for data centres. Our results, based on realistic profiles of an energy community and a data centre, showcase significant benefits of these synergies, with a 38% reduction in operating costs and an 87% decrease in heat demand.
Submitted 24 October, 2024; v1 submitted 23 October, 2024; originally announced October 2024.
Comments: Accepted at IEEE PES ISGT Europe 2024
arXiv:2410.17435 [pdf, other]  eess.SY (Systems and Control); cs.DC (Distributed, Parallel, and Cluster Computing)
AI-focused HPC Data Centers Can Provide More Power Grid Flexibility and at Lower Cost
Authors: Yihong Zhou, Angel Paredes, Chaimaa Essayeh, Thomas Morstyn
Abstract: The recent growth of Artificial Intelligence (AI), particularly large language models, requires energy-demanding high-performance computing (HPC) data centers, which poses a significant burden on power system capacity. Scheduling data center computing jobs to manage power demand can alleviate network stress with minimal infrastructure investment and contribute to fast time-scale power system balancing. This study, for the first time, comprehensively analyzes the capability and cost of grid flexibility provision by GPU-heavy AI-focused HPC data centers, along with a comparison with CPU-heavy general-purpose HPC data centers traditionally used for scientific computing. A data center flexibility cost model is proposed that accounts for the value of computing. Using real-world computing traces from 7 AI-focused HPC data centers and 7 general-purpose HPC data centers, along with computing prices from 3 cloud platforms, we find that AI-focused HPC data centers can offer greater flexibility at 50% lower cost compared to general-purpose HPC data centers for a range of power system services. By comparing the cost to flexibility market prices, we illustrate the financial profitability of flexibility provision for AI-focused HPC data centers. Finally, our flexibility and cost estimates can be scaled using parameters of other data centers through algebraic operations, avoiding the need for re-optimization.
Submitted 13 December, 2024; v1 submitted 22 October, 2024; originally announced October 2024.
Comments: The supplementary datasets are available at Zenodo: https://zenodo.org/records/14454915
arXiv:2410.14116 [pdf, ps, other]  eess.SY (Systems and Control); math.OC (Optimization and Control)
Robustness to Model Approximation, Empirical Model Learning, and Sample Complexity in Wasserstein Regular MDPs
Authors: Yichen Zhou, Yanglei Song, Serdar Yüksel
Abstract: The paper studies the robustness properties of discrete-time stochastic optimal control under Wasserstein model approximation for both discounted cost and average cost criteria. Specifically, we study the performance loss when applying an optimal policy designed for an approximate model to the true dynamics compared with the optimal cost for the true model under the sup-norm-induced metric, and relate it to the Wasserstein-1 distance between the approximate and true transition kernels. A primary motivation of this analysis is empirical model learning, as well as empirical noise distribution learning, where Wasserstein convergence holds under mild conditions but stronger convergence criteria, such as total variation, may not. We discuss applications of the results to the disturbance estimation problem, where sample complexity bounds are given, and also to a general empirical model learning approach, obtained under either Markov or i.i.d. learning settings. Further applications regarding the continuity of invariant probability measures with respect to transition kernels are also discussed.
Submitted 14 November, 2024; v1 submitted 17 October, 2024; originally announced October 2024.
Comments: 35 pages
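The Wasserstein-1 distance between an empirical sample and the underlying distribution, which drives the robustness bounds above, can be computed directly for scalar samples. A small numerical check with SciPy (the noise distribution is invented): W1 shrinks as the sample grows, which is the sense in which empirical model learning converges even when total variation would not.

    import numpy as np
    from scipy.stats import wasserstein_distance

    rng = np.random.default_rng(6)
    reference = rng.normal(0.0, 1.0, size=100_000)  # stands in for the truth

    for n in (10, 100, 1_000, 10_000):
        sample = rng.normal(0.0, 1.0, size=n)
        print(n, round(wasserstein_distance(sample, reference), 4))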
style="display: inline;"> Neuromorphic computing has emerged as a promising energy-efficient alternative to traditional artificial intelligence, predominantly utilizing spiking neural networks (SNNs) implemented on neuromorphic hardware. Significant advancements have been made in SNN-based convolutional neural networks (CNNs) and Transformer architectures. However, neuromorphic computing for the medical imaging domain rema&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.09674v2-abstract-full').style.display = 'inline'; document.getElementById('2410.09674v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.09674v2-abstract-full" style="display: none;"> Neuromorphic computing has emerged as a promising energy-efficient alternative to traditional artificial intelligence, predominantly utilizing spiking neural networks (SNNs) implemented on neuromorphic hardware. Significant advancements have been made in SNN-based convolutional neural networks (CNNs) and Transformer architectures. However, neuromorphic computing for the medical imaging domain remains underexplored. In this study, we introduce EG-SpikeFormer, an SNN architecture tailored for clinical tasks that incorporates eye-gaze data to guide the model&#39;s attention to the diagnostically relevant regions in medical images. Our developed approach effectively addresses shortcut learning issues commonly observed in conventional models, especially in scenarios with limited clinical data and high demands for model reliability, generalizability, and transparency. Our EG-SpikeFormer not only demonstrates superior energy efficiency and performance in medical image prediction tasks but also enhances clinical relevance through multi-modal information alignment. By incorporating eye-gaze data, the model improves interpretability and generalization, opening new directions for applying neuromorphic computing in healthcare. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.09674v2-abstract-full').style.display = 'none'; document.getElementById('2410.09674v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 12 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.09406">arXiv:2410.09406</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.09406">pdf</a>, <a href="https://arxiv.org/format/2410.09406">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Emerging Technologies">cs.ET</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Quantum Physics">quant-ph</span> </div> </div> <p class="title is-5 mathjax"> Quantum Neural Network for Accelerated Magnetic Resonance Imaging </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Zhou%2C+S">Shuo Zhou</a>, <a href="/search/eess?searchtype=author&amp;query=Zhou%2C+Y">Yihang Zhou</a>, <a href="/search/eess?searchtype=author&amp;query=Liu%2C+C">Congcong Liu</a>, <a href="/search/eess?searchtype=author&amp;query=Zhu%2C+Y">Yanjie Zhu</a>, <a href="/search/eess?searchtype=author&amp;query=Zheng%2C+H">Hairong Zheng</a>, <a href="/search/eess?searchtype=author&amp;query=Liang%2C+D">Dong Liang</a>, <a href="/search/eess?searchtype=author&amp;query=Wang%2C+H">Haifeng Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.09406v1-abstract-short" style="display: inline;"> Magnetic resonance image reconstruction starting from undersampled k-space data requires the recovery of many potential nonlinear features, which is very difficult for algorithms to recover these features. In recent years, the development of quantum computing has discovered that quantum convolution can improve network accuracy, possibly due to potential quantum advantages. This article proposes a&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.09406v1-abstract-full').style.display = 'inline'; document.getElementById('2410.09406v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.09406v1-abstract-full" style="display: none;"> Magnetic resonance image reconstruction starting from undersampled k-space data requires the recovery of many potential nonlinear features, which is very difficult for algorithms to recover these features. In recent years, the development of quantum computing has discovered that quantum convolution can improve network accuracy, possibly due to potential quantum advantages. This article proposes a hybrid neural network containing quantum and classical networks for fast magnetic resonance imaging, and conducts experiments on a quantum computer simulation system. The experimental results indicate that the hybrid network has achieved excellent reconstruction results, and also confirm the feasibility of applying hybrid quantum-classical neural networks into the image reconstruction of rapid magnetic resonance imaging. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.09406v1-abstract-full').style.display = 'none'; document.getElementById('2410.09406v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at 2024 IEEE International Conference on Imaging Systems and Techniques (IST 2024)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.07572">arXiv:2410.07572</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.07572">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Optics">physics.optics</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Edge-guided inverse design of digital metamaterials for ultra-high-capacity on-chip multi-dimensional interconnect </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&amp;query=Sun%2C+A">Aolong Sun</a>, <a href="/search/eess?searchtype=author&amp;query=Xing%2C+S">Sizhe Xing</a>, <a href="/search/eess?searchtype=author&amp;query=Deng%2C+X">Xuyu Deng</a>, <a href="/search/eess?searchtype=author&amp;query=Shen%2C+R">Ruoyu Shen</a>, <a href="/search/eess?searchtype=author&amp;query=Yan%2C+A">An Yan</a>, <a href="/search/eess?searchtype=author&amp;query=Hu%2C+F">Fangchen Hu</a>, <a href="/search/eess?searchtype=author&amp;query=Yuan%2C+Y">Yuqin Yuan</a>, <a href="/search/eess?searchtype=author&amp;query=Dong%2C+B">Boyu Dong</a>, <a href="/search/eess?searchtype=author&amp;query=Zhao%2C+J">Junhao Zhao</a>, <a href="/search/eess?searchtype=author&amp;query=Huang%2C+O">Ouhan Huang</a>, <a href="/search/eess?searchtype=author&amp;query=Li%2C+Z">Ziwei Li</a>, <a href="/search/eess?searchtype=author&amp;query=Shi%2C+J">Jianyang Shi</a>, <a href="/search/eess?searchtype=author&amp;query=Zhou%2C+Y">Yingjun Zhou</a>, <a href="/search/eess?searchtype=author&amp;query=Shen%2C+C">Chao Shen</a>, <a href="/search/eess?searchtype=author&amp;query=Zhao%2C+Y">Yiheng Zhao</a>, <a href="/search/eess?searchtype=author&amp;query=Hong%2C+B">Bingzhou Hong</a>, <a href="/search/eess?searchtype=author&amp;query=Chu%2C+W">Wei Chu</a>, <a href="/search/eess?searchtype=author&amp;query=Zhang%2C+J">Junwen Zhang</a>, <a href="/search/eess?searchtype=author&amp;query=Cai%2C+H">Haiwen Cai</a>, <a href="/search/eess?searchtype=author&amp;query=Chi%2C+N">Nan Chi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.07572v1-abstract-short" style="display: inline;"> The escalating demands of compute-intensive applications, including artificial intelligence, urgently necessitate the adoption of sophisticated optical on-chip interconnect technologies to overcome critical bottlenecks in scaling future computing systems. 
arXiv:2410.07572 — https://arxiv.org/abs/2410.07572 [pdf: https://arxiv.org/pdf/2410.07572]
Subjects: physics.optics (Optics); eess.SP (Signal Processing)

Edge-guided inverse design of digital metamaterials for ultra-high-capacity on-chip multi-dimensional interconnect

Authors: Aolong Sun, Sizhe Xing, Xuyu Deng, Ruoyu Shen, An Yan, Fangchen Hu, Yuqin Yuan, Boyu Dong, Junhao Zhao, Ouhan Huang, Ziwei Li, Jianyang Shi, Yingjun Zhou, Chao Shen, Yiheng Zhao, Bingzhou Hong, Wei Chu, Junwen Zhang, Haiwen Cai, Nan Chi

Abstract: The escalating demands of compute-intensive applications, including artificial intelligence, urgently necessitate the adoption of sophisticated optical on-chip interconnect technologies to overcome critical bottlenecks in scaling future computing systems. This transition requires leveraging the inherent parallelism of the wavelength and mode dimensions of light, complemented by high-order modulation formats, to significantly enhance data throughput. Here we experimentally demonstrate a novel synergy of these three dimensions, achieving multi-tens-of-terabits-per-second on-chip interconnects using ultra-broadband, multi-mode digital metamaterials. Employing a highly efficient edge-guided analog-and-digital optimization method, we inversely design foundry-compatible, robust, and multi-port digital metamaterials with an 8x higher computational efficiency. Using a packaged five-mode multiplexing chip, we demonstrate a single-wavelength interconnect capacity of 1.62 Tbit/s and a record-setting multi-dimensional interconnect capacity of 38.2 Tbit/s across 5 modes and 88 wavelength channels. A theoretical analysis suggests that further system optimization can enable on-chip interconnects to reach sub-petabit-per-second data transmission rates. This study highlights the transformative potential of optical interconnect technologies to surmount the constraints of electronic links, thus setting the stage for next-generation datacenter and optical compute interconnects.

Submitted 9 October, 2024; originally announced October 2024.
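A "digital metamaterial" here is a device whose footprint is divided into binary pixels (etched or unetched), which turns inverse design into a combinatorial search. The sketch below shows the classic direct-binary-search loop that digital optimizers of this kind build on; the paper's edge-guided analog-and-digital method and its electromagnetic solver are not reproduced, so `figure_of_merit` is a hypothetical stand-in for a full-wave simulation.

```python
# Illustrative direct binary search over a binary pixel grid
# (generic technique, not the paper's edge-guided method).
import numpy as np

rng = np.random.default_rng(0)

def figure_of_merit(grid: np.ndarray) -> float:
    # Placeholder objective: a real design would evaluate transmission and
    # modal crosstalk with an EM solver (e.g., FDTD). Here we score
    # agreement with an arbitrary checkerboard "target" pattern.
    target = np.indices(grid.shape).sum(axis=0) % 2
    return float((grid == target).mean())

def direct_binary_search(shape=(10, 10), sweeps=5):
    grid = rng.integers(0, 2, size=shape)   # random etched/unetched pixels
    best = figure_of_merit(grid)
    for _ in range(sweeps):
        for idx in np.ndindex(*shape):      # toggle one pixel at a time
            grid[idx] ^= 1
            fom = figure_of_merit(grid)
            if fom > best:
                best = fom                  # keep a beneficial flip
            else:
                grid[idx] ^= 1              # revert a harmful flip
    return grid, best

design, fom = direct_binary_search()
print(f"final figure of merit: {fom:.3f}")
```

The appeal of such pixelated designs, as the abstract notes, is that they stay foundry-compatible: each pixel maps directly to a manufacturable etch feature.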