Search | arXiv e-print repository
<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 288 results for author: <span class="mathjax">Gao, Y</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> <div class="content"> <form method="GET" action="/search/eess" aria-role="search"> Searching in archive <strong>eess</strong>. <a href="/search/?searchtype=author&query=Gao%2C+Y">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Gao, Y"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Gao%2C+Y&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Gao, Y"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Gao%2C+Y&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Gao%2C+Y&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Gao%2C+Y&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Gao%2C+Y&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Gao%2C+Y&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Gao%2C+Y&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li> <a href="/search/?searchtype=author&query=Gao%2C+Y&start=250" class="pagination-link " aria-label="Page 6" aria-current="page">6 </a> </li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.14865">arXiv:2411.14865</a> <span> [<a href="https://arxiv.org/pdf/2411.14865">pdf</a>, <a href="https://arxiv.org/format/2411.14865">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Benchmarking the Robustness of Optical Flow Estimation to Corruptions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Yi%2C+Z">Zhonghua Yi</a>, <a href="/search/eess?searchtype=author&query=Shi%2C+H">Hao Shi</a>, <a href="/search/eess?searchtype=author&query=Jiang%2C+Q">Qi Jiang</a>, <a href="/search/eess?searchtype=author&query=Gao%2C+Y">Yao Gao</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+Z">Ze Wang</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+Y">Yufan Zhang</a>, <a href="/search/eess?searchtype=author&query=Yang%2C+K">Kailun Yang</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+K">Kaiwei Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.14865v1-abstract-short" style="display: inline;"> Optical flow estimation is extensively used in autonomous driving and video editing. 
While existing models demonstrate state-of-the-art performance across various benchmarks, the robustness of these methods has been infrequently investigated. Despite some research focusing on the robustness of optical flow models against adversarial attacks, there has been a lack of studies investigating their rob… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14865v1-abstract-full').style.display = 'inline'; document.getElementById('2411.14865v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.14865v1-abstract-full" style="display: none;"> Optical flow estimation is extensively used in autonomous driving and video editing. While existing models demonstrate state-of-the-art performance across various benchmarks, the robustness of these methods has been infrequently investigated. Despite some research focusing on the robustness of optical flow models against adversarial attacks, there has been a lack of studies investigating their robustness to common corruptions. Taking into account the unique temporal characteristics of optical flow, we introduce 7 temporal corruptions specifically designed for benchmarking the robustness of optical flow models, in addition to 17 classical single-image corruptions, in which advanced PSF Blur simulation method is performed. Two robustness benchmarks, KITTI-FC and GoPro-FC, are subsequently established as the first corruption robustness benchmark for optical flow estimation, with Out-Of-Domain (OOD) and In-Domain (ID) settings to facilitate comprehensive studies. Robustness metrics, Corruption Robustness Error (CRE), Corruption Robustness Error ratio (CREr), and Relative Corruption Robustness Error (RCRE) are further introduced to quantify the optical flow estimation robustness. 29 model variants from 15 optical flow methods are evaluated, yielding 10 intriguing observations, such as 1) the absolute robustness of the model is heavily dependent on the estimation performance; 2) the corruptions that diminish local information are more serious than that reduce visual effects. We also give suggestions for the design and application of optical flow models. We anticipate that our benchmark will serve as a foundational resource for advancing research in robust optical flow estimation. The benchmarks and source code will be released at https://github.com/ZhonghuaYi/optical_flow_robustness_benchmark. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14865v1-abstract-full').style.display = 'none'; document.getElementById('2411.14865v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
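The paper defines CRE, CREr, and RCRE precisely; as a rough illustration of how such corruption-robustness metrics are typically built, a sketch under assumed definitions (mean end-point-error degradation, absolute and relative) might look like:

```python
import numpy as np

def epe(flow_pred, flow_gt):
    """Mean end-point error (EPE) between flow fields of shape (H, W, 2)."""
    return float(np.mean(np.linalg.norm(flow_pred - flow_gt, axis=-1)))

def corruption_robustness(epe_clean, epe_per_corruption):
    """Assumed reading of the metrics: CRE as the mean EPE degradation over
    all corruption types, CREr as that degradation relative to clean EPE."""
    epe_per_corruption = np.asarray(epe_per_corruption)
    cre = float(np.mean(epe_per_corruption - epe_clean))
    crer = cre / epe_clean
    return cre, crer
```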
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">The benchmarks and source code will be released at https://github.com/ZhonghuaYi/optical_flow_robustness_benchmark</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.13785">arXiv:2411.13785</a> <span> [<a href="https://arxiv.org/pdf/2411.13785">pdf</a>, <a href="https://arxiv.org/ps/2411.13785">ps</a>, <a href="https://arxiv.org/format/2411.13785">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Throughput Maximization for Movable Antenna Systems with Movement Delay Consideration </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Wang%2C+H">Honghao Wang</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+Q">Qingqing Wu</a>, <a href="/search/eess?searchtype=author&query=Gao%2C+Y">Ying Gao</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+W">Wen Chen</a>, <a href="/search/eess?searchtype=author&query=Mei%2C+W">Weidong Mei</a>, <a href="/search/eess?searchtype=author&query=Hu%2C+G">Guojie Hu</a>, <a href="/search/eess?searchtype=author&query=Xu%2C+L">Lexi Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.13785v1-abstract-short" style="display: inline;"> In this paper, we model the minimum achievable throughput within a transmission block of restricted duration and aim to maximize it in movable antenna (MA)-enabled multiuser downlink communications. Particularly, we account for the antenna moving delay caused by mechanical movement, which has not been fully considered in previous studies, and reveal the trade-off between the delay and signal-to-in… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13785v1-abstract-full').style.display = 'inline'; document.getElementById('2411.13785v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.13785v1-abstract-full" style="display: none;"> In this paper, we model the minimum achievable throughput within a transmission block of restricted duration and aim to maximize it in movable antenna (MA)-enabled multiuser downlink communications. Particularly, we account for the antenna moving delay caused by mechanical movement, which has not been fully considered in previous studies, and reveal the trade-off between the delay and signal-to-interference-plus-noise ratio at users. To this end, we first consider a single-user setup to analyze the necessity of antenna movement. By quantizing the virtual angles of arrival, we derive the requisite region size for antenna moving, design the initial MA position, and elucidate the relationship between quantization resolution and moving region size. Furthermore, an efficient algorithm is developed to optimize MA position via successive convex approximation, which is subsequently extended to the general multiuser setup. 
3. arXiv:2411.13560 (https://arxiv.org/abs/2411.13560) [pdf, other]
   Subjects: cs.AI (Artificial Intelligence); cs.AR (Hardware Architecture); cs.ET (Emerging Technologies); eess.SP (Signal Processing)
   Title: AMSnet-KG: A Netlist Dataset for LLM-based AMS Circuit Auto-Design Using Knowledge Graph RAG
   Authors: Yichen Shi, Zhuofu Tao, Yuhao Gao, Tianjia Zhou, Cheng Chang, Yaxing Wang, Bingyu Chen, Genhao Zhang, Alvin Liu, Zhiping Yu, Ting-Jung Lin, Lei He
   Abstract: High-performance analog and mixed-signal (AMS) circuits are mainly full-custom designed, which is time-consuming and labor-intensive. A significant portion of the effort is experience-driven, which makes the automation of AMS circuit design a formidable challenge. Large language models (LLMs) have emerged as powerful tools for Electronic Design Automation (EDA) applications, fostering advancements in the automatic design process for large-scale AMS circuits. However, the absence of high-quality datasets has led to issues such as model hallucination, which undermines the robustness of automatically generated circuit designs. To address this issue, this paper introduces AMSnet-KG, a dataset encompassing various AMS circuit schematics and netlists. We construct a knowledge graph with annotations on detailed functional and performance characteristics. Facilitated by AMSnet-KG, we propose an automated AMS circuit generation framework that utilizes the comprehensive knowledge embedded in LLMs. We first formulate a design strategy (e.g., a circuit architecture using a number of circuit components) based on the required specifications. Next, matched circuit components are retrieved and assembled into a complete topology, and transistor sizing is obtained through Bayesian optimization. Simulation results of the netlist are fed back to the LLM for further topology refinement, ensuring the circuit design specifications are met. We perform case studies of operational amplifier and comparator design to verify the automatic design flow from specifications to netlists with minimal human effort. The dataset used in this paper will be open-sourced upon publication.
   Submitted 6 November, 2024; originally announced November 2024.
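The sizing-with-simulation-feedback step can be pictured as a loop over candidate transistor sizes scored by a simulator. In this sketch a toy gain/power model and plain random search stand in for the SPICE runs and the Bayesian optimization named in the abstract; every function and constant here is hypothetical:

```python
import random

def simulate(w_um, l_um):
    """Stand-in for a SPICE run (toy model, not the paper's simulator)."""
    return {"gain_db": 20 * (w_um / l_um) ** 0.5, "power_mw": 0.2 * w_um}

def size_transistors(target_gain_db, n_trials=200, seed=0):
    """Random search over W/L as a dependency-free stand-in for the Bayesian
    optimization step: meet the gain spec, then minimize power."""
    rng = random.Random(seed)
    best, best_cost = None, float("inf")
    for _ in range(n_trials):
        w, l = rng.uniform(1, 50), rng.uniform(0.1, 2)
        m = simulate(w, l)
        cost = 10 * max(0.0, target_gain_db - m["gain_db"]) + m["power_mw"]
        if cost < best_cost:
            best, best_cost = (w, l), cost
    return best
```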
4. arXiv:2411.06738 (https://arxiv.org/abs/2411.06738) [pdf, other]
   Subjects: eess.IV (Image and Video Processing)
   Title: 360-Degree Video Super Resolution and Quality Enhancement Challenge: Methods and Results
   Authors: Ahmed Telili, Wassim Hamidouche, Ibrahim Farhat, Hadi Amirpour, Christian Timmerer, Ibrahim Khadraoui, Jiajie Lu, The Van Le, Jeonneung Baek, Jin Young Lee, Yiying Wei, Xiaopeng Sun, Yu Gao, JianCheng Huangl, Yujie Zhong
   Abstract: Omnidirectional (360-degree) video is rapidly gaining popularity due to advancements in immersive technologies like virtual reality (VR) and extended reality (XR). However, real-time streaming of such videos, especially in live mobile scenarios like unmanned aerial vehicles (UAVs), is challenged by limited bandwidth and strict latency constraints. Traditional methods, such as compression and adaptive resolution, help but often compromise video quality and introduce artifacts that degrade the viewer experience. Additionally, the unique spherical geometry of 360-degree video presents challenges not encountered in traditional 2D video. To address these issues, we initiated the 360-degree Video Super Resolution and Quality Enhancement Challenge. This competition encourages participants to develop efficient machine learning solutions to enhance the quality of low-bitrate compressed 360-degree videos, with two tracks focusing on 2x and 4x super-resolution (SR). In this paper, we outline the challenge framework, detailing the two competition tracks and highlighting the SR solutions proposed by the top-performing models. We assess these models within a unified framework, considering quality enhancement, bitrate gain, and computational efficiency. This challenge aims to drive innovation in real-time 360-degree video streaming, improving the quality and accessibility of immersive visual experiences.
   Submitted 11 November, 2024; originally announced November 2024.
   Comments: 14 pages, 9 figures
5. arXiv:2411.04611 (https://arxiv.org/abs/2411.04611) [pdf, other]
   Subjects: eess.SP (Signal Processing)
   Title: Compressive Spectrum Sensing with 1-bit ADCs
   Authors: Jian Yang, Zihang Song, Han Zhang, Yue Gao
   Abstract: Efficient wideband spectrum sensing (WSS) is essential for managing spectrum scarcity in wireless communications. However, existing compressed sensing (CS)-based WSS methods require high sampling rates and power consumption, particularly with high-precision analog-to-digital converters (ADCs). Although 1-bit CS with low-precision ADCs can mitigate these demands, most approaches still depend on multi-user cooperation and prior sparsity information, which are often unavailable in WSS scenarios. This paper introduces a non-cooperative WSS method using multicoset sampling with 1-bit ADCs to achieve sub-Nyquist sampling without requiring sparsity knowledge. We analyze the impact of 1-bit quantization on multiband signals, then apply eigenvalue decomposition to isolate the signal subspace from the noise, enabling spectrum support estimation without signal reconstruction. This approach provides a power-efficient solution for WSS that eliminates the need for cooperation and prior information.
   Submitted 7 November, 2024; originally announced November 2024.
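The reconstruction-free support estimation described here (1-bit quantization, then an eigenvalue decomposition separating signal from noise subspace) might look roughly as follows; the shapes and the threshold rule are illustrative assumptions, not the paper's algorithm:

```python
import numpy as np

def signal_subspace_1bit(samples, rel_threshold=0.1):
    """samples: complex measurements from the multicoset channels, shape
    (n_channels, n_snapshots). Quantize to 1 bit per real/imag part, form
    the sample covariance, and keep the eigen-directions whose eigenvalues
    rise clearly above the noise floor (the signal subspace)."""
    q = np.sign(samples.real) + 1j * np.sign(samples.imag)  # 1-bit ADC model
    R = q @ q.conj().T / q.shape[1]                         # sample covariance
    eigvals, eigvecs = np.linalg.eigh(R)                    # ascending order
    mask = eigvals > rel_threshold * eigvals[-1]            # signal vs noise split
    return eigvecs[:, mask]                                 # signal-subspace basis
```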
6. arXiv:2411.00900 (https://arxiv.org/abs/2411.00900) [pdf, other]
   Subjects: eess.IV (Image and Video Processing); cs.CV (Computer Vision and Pattern Recognition)
   Title: Intensity Field Decomposition for Tissue-Guided Neural Tomography
   Authors: Meng-Xun Li, Jin-Gang Yu, Yuan Gao, Cui Huang, Gui-Song Xia
   Abstract: Cone-beam computed tomography (CBCT) typically requires hundreds of X-ray projections, which raises concerns about radiation exposure. While sparse-view reconstruction reduces the exposure by using fewer projections, it struggles to achieve satisfactory image quality. To address this challenge, this article introduces a novel sparse-view CBCT reconstruction method that empowers the neural field with human tissue regularization. Our approach, termed tissue-guided neural tomography (TNT), is motivated by the distinct intensity differences between bone and soft tissue in CBCT. Intuitively, separating these components may aid the learning process of the neural field. More precisely, TNT comprises a heterogeneous quadruple network and a corresponding training strategy. The network represents the intensity field as a combination of soft- and hard-tissue components, along with their respective textures. We train the network with guidance from estimated tissue projections, enabling efficient learning of the desired patterns for the network heads. Extensive experiments demonstrate that the proposed method significantly improves sparse-view CBCT reconstruction with a limited number of projections ranging from 10 to 60. Our method achieves comparable reconstruction quality with fewer projections and faster convergence than state-of-the-art neural-rendering-based methods.
   Submitted 1 November, 2024; originally announced November 2024.
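The decomposition idea (one neural field head per tissue component, combined into the final intensity) can be sketched as below; the head count, sizes, and combination rule are guesses for illustration, not TNT's actual quadruple network:

```python
import torch
import torch.nn as nn

class DecomposedIntensityField(nn.Module):
    """Toy neural field: query a 3D point, predict soft-tissue and hard-tissue
    (bone) intensity components separately, and sum them."""
    def __init__(self, hidden=64):
        super().__init__()
        def head():
            return nn.Sequential(nn.Linear(3, hidden), nn.ReLU(),
                                 nn.Linear(hidden, hidden), nn.ReLU(),
                                 nn.Linear(hidden, 1))
        self.soft, self.hard = head(), head()

    def forward(self, xyz):                  # xyz: (N, 3) query points
        soft = self.soft(xyz)                # low-intensity soft tissue
        hard = self.hard(xyz)                # high-intensity bone
        return soft + hard, (soft, hard)     # total intensity + components
```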
7. arXiv:2410.24039 (https://arxiv.org/abs/2410.24039) [pdf, other]
   Subjects: cs.NI (Networking and Internet Architecture); eess.SY (Systems and Control)
   Title: Efficient Satellite-Ground Interconnection Design for Low-orbit Mega-Constellation Topology
   Authors: Wenhao Liu, Jiazhi Wu, Quanwei Lin, Handong Luo, Qi Zhang, Kun Qiu, Zhe Chen, Yue Gao
   Abstract: The low-orbit mega-constellation network (LMCN) is an important part of the space-air-ground integrated network system. An effective satellite-ground interconnection design can result in a stable constellation topology for LMCNs. A naive solution is accessing the satellite with the longest remaining service time (LRST), which is widely used in previous designs. The Coordinated Satellite-Ground Interconnecting (CSGI) algorithm, the state of the art, coordinates the establishment of ground-satellite links (GSLs); compared with existing solutions, it reduces latency by 19% and jitter by 70% on average. However, CSGI only supports the scenario where terminals access a single satellite, and so cannot fully utilize the multi-access capabilities of terminals. Additionally, CSGI's high computational complexity poses deployment challenges. To overcome these problems, we propose the Classification-based Longest Remaining Service Time (C-LRST) algorithm. C-LRST supports the actual scenario with multi-access capabilities, and it adds optional paths during routing with low computational complexity, improving end-to-end communications quality. We conduct a 1000-second simulation from Brazil to Lithuania on the open-source platform Hypatia. Experiment results show that, compared with CSGI, C-LRST reduces latency and increases throughput by approximately 60% and 40%, respectively. In addition, C-LRST's GSL switching number is 14, whereas CSGI's is 23; C-LRST thus has better link stability than CSGI.
   Submitted 31 October, 2024; originally announced October 2024.
   Comments: 13 pages, 14 figures
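The LRST baseline that C-LRST builds on is a one-line rule: among satellites currently visible to a terminal, attach to the one that will remain in view longest. A minimal sketch, assuming a (satellite_id, remaining_service_time) data layout:

```python
def pick_satellite_lrst(visible):
    """visible: list of (satellite_id, remaining_service_time_s) pairs for
    satellites above the terminal's elevation mask. LRST picks the one that
    stays in view longest, minimizing near-term handovers."""
    if not visible:
        return None
    return max(visible, key=lambda sat: sat[1])[0]

# e.g. pick_satellite_lrst([("sat-17", 42.0), ("sat-03", 118.5)]) -> "sat-03"
```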
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">13 pages, 14 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.21351">arXiv:2410.21351</a> <span> [<a href="https://arxiv.org/pdf/2410.21351">pdf</a>, <a href="https://arxiv.org/format/2410.21351">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> LinFormer: A Linear-based Lightweight Transformer Architecture For Time-Aware MIMO Channel Prediction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Jin%2C+Y">Yanliang Jin</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+Y">Yifan Wu</a>, <a href="/search/eess?searchtype=author&query=Gao%2C+Y">Yuan Gao</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+S">Shunqing Zhang</a>, <a href="/search/eess?searchtype=author&query=Xu%2C+S">Shugong Xu</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+C">Cheng-Xiang Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.21351v1-abstract-short" style="display: inline;"> The emergence of 6th generation (6G) mobile networks brings new challenges in supporting high-mobility communications, particularly in addressing the issue of channel aging. While existing channel prediction methods offer improved accuracy at the expense of increased computational complexity, limiting their practical application in mobile networks. To address these challenges, we present LinFormer… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21351v1-abstract-full').style.display = 'inline'; document.getElementById('2410.21351v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.21351v1-abstract-full" style="display: none;"> The emergence of 6th generation (6G) mobile networks brings new challenges in supporting high-mobility communications, particularly in addressing the issue of channel aging. While existing channel prediction methods offer improved accuracy at the expense of increased computational complexity, limiting their practical application in mobile networks. To address these challenges, we present LinFormer, an innovative channel prediction framework based on a scalable, all-linear, encoder-only Transformer model. Our approach, inspired by natural language processing (NLP) models such as BERT, adapts an encoder-only architecture specifically for channel prediction tasks. We propose replacing the computationally intensive attention mechanism commonly used in Transformers with a time-aware multi-layer perceptron (TMLP), significantly reducing computational demands. The inherent time awareness of TMLP module makes it particularly suitable for channel prediction tasks. 
   We enhance LinFormer's training process by employing a weighted mean squared error loss (WMSELoss) function and data augmentation techniques, leveraging larger, readily available communication datasets. Our approach achieves a substantial reduction in computational complexity while maintaining high prediction accuracy, making it more suitable for deployment in cost-effective base stations (BS). Comprehensive experiments using both simulated and measured data demonstrate that LinFormer outperforms existing methods across various mobility scenarios, offering a promising solution for future wireless communication systems.
   Submitted 28 October, 2024; originally announced October 2024.
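One plausible reading of "replacing attention with a time-aware MLP" is mixing information across the time axis with a fixed learned linear map instead of a dynamic attention matrix, in the spirit of MLP-Mixer; the block below sketches that reading and is not the paper's exact module:

```python
import torch
import torch.nn as nn

class TMLPBlock(nn.Module):
    """Mixes across the time axis with a learned linear map (no query-key
    attention), then across features, each with a residual connection."""
    def __init__(self, seq_len, dim, hidden=128):
        super().__init__()
        self.norm1 = nn.LayerNorm(dim)
        self.time_mix = nn.Linear(seq_len, seq_len)     # operates on the T axis
        self.norm2 = nn.LayerNorm(dim)
        self.feat_mix = nn.Sequential(nn.Linear(dim, hidden), nn.GELU(),
                                      nn.Linear(hidden, dim))

    def forward(self, x):                               # x: (batch, T, dim)
        h = self.norm1(x).transpose(1, 2)               # (batch, dim, T)
        x = x + self.time_mix(h).transpose(1, 2)        # time mixing + residual
        return x + self.feat_mix(self.norm2(x))         # feature mixing + residual
```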
9. arXiv:2410.20514 (https://arxiv.org/abs/2410.20514) [pdf, other]
   Subjects: cs.RO (Robotics); eess.SY (Systems and Control)
   Title: Uncertainty-Aware Decision-Making and Planning for Autonomous Forced Merging
   Authors: Jian Zhou, Yulong Gao, Björn Olofsson, Erik Frisk
   Abstract: In this paper, we develop an uncertainty-aware decision-making and motion-planning method for an autonomous ego vehicle in forced merging scenarios, considering the motion uncertainty of surrounding vehicles. The method dynamically captures the uncertainty of surrounding vehicles by online estimation of their acceleration bounds, enabling a reactive but rapid understanding of their uncertainty characteristics. By leveraging these estimated bounds, a non-conservative forward occupancy of the surrounding vehicles is predicted over a horizon and incorporated into both the decision-making process and the motion-planning strategy, to enhance the resilience and safety of the planned reference trajectory. The method successfully fulfills the tasks in challenging forced merging scenarios, and its properties are illustrated by comparison with several alternative approaches.
   Submitted 27 October, 2024; originally announced October 2024.
   Comments: Accepted by the 63rd IEEE Conference on Decision and Control, 2024
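The prediction step (turning estimated acceleration bounds into a forward occupancy over the horizon) reduces, in a one-dimensional double-integrator simplification, to interval propagation; this sketch is an illustration of the idea, not the paper's occupancy model:

```python
def forward_occupancy_1d(p0, v0, a_min, a_max, dt, horizon_steps):
    """Interval of positions reachable by a vehicle at position p0 with speed
    v0 and acceleration confined to [a_min, a_max] (double integrator; speed
    nonnegativity is ignored for brevity)."""
    intervals = []
    for k in range(1, horizon_steps + 1):
        t = k * dt
        lo = p0 + v0 * t + 0.5 * a_min * t * t   # always braking hardest
        hi = p0 + v0 * t + 0.5 * a_max * t * t   # always accelerating hardest
        intervals.append((lo, hi))
    return intervals
```

Tighter estimated bounds shrink these intervals, which is what makes the predicted occupancy "non-conservative" relative to worst-case assumptions.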
10. arXiv:2410.13267 (https://arxiv.org/abs/2410.13267) [pdf, other]
    Subjects: cs.SD (Sound); cs.CL (Computation and Language); eess.AS (Audio and Speech Processing)
    Title: CLaMP 2: Multimodal Music Information Retrieval Across 101 Languages Using Large Language Models
    Authors: Shangda Wu, Yashan Wang, Ruibin Yuan, Zhancheng Guo, Xu Tan, Ge Zhang, Monan Zhou, Jing Chen, Xuefeng Mu, Yuejie Gao, Yuanliang Dong, Jiafeng Liu, Xiaobing Li, Feng Yu, Maosong Sun
    Abstract: Current music information retrieval systems face challenges in managing linguistic diversity and integrating various musical modalities; these limitations reduce their effectiveness in a global, multimodal music environment. To address these issues, we introduce CLaMP 2, a system compatible with 101 languages that supports both ABC notation (a text-based musical notation format) and MIDI (Musical Instrument Digital Interface) for music information retrieval. CLaMP 2, pre-trained on 1.5 million ABC-MIDI-text triplets, includes a multilingual text encoder and a multimodal music encoder aligned via contrastive learning. By leveraging large language models, we obtain refined and consistent multilingual descriptions at scale, significantly reducing textual noise and balancing language distribution. Our experiments show that CLaMP 2 achieves state-of-the-art results in both multilingual semantic search and music classification across modalities, thus establishing a new standard for inclusive and global music information retrieval.
    Submitted 17 October, 2024; originally announced October 2024.
    Comments: 17 pages, 10 figures, 4 tables
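Aligning a text encoder and a music encoder via contrastive learning is plausibly CLIP-style; a standard symmetric InfoNCE loss over a batch of paired embeddings looks like this (a generic sketch, not necessarily CLaMP 2's exact objective):

```python
import torch
import torch.nn.functional as F

def contrastive_alignment_loss(text_emb, music_emb, temperature=0.07):
    """Symmetric InfoNCE: the i-th text and i-th music item form a positive
    pair; all other in-batch pairings act as negatives."""
    text = F.normalize(text_emb, dim=-1)
    music = F.normalize(music_emb, dim=-1)
    logits = text @ music.T / temperature              # (B, B) similarities
    labels = torch.arange(len(logits), device=logits.device)
    return 0.5 * (F.cross_entropy(logits, labels) +    # text -> music
                  F.cross_entropy(logits.T, labels))   # music -> text
```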
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">17 pages, 10 figures, 4 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.12438">arXiv:2410.12438</a> <span> [<a href="https://arxiv.org/pdf/2410.12438">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Modeling, Prediction and Risk Management of Distribution System Voltages with Non-Gaussian Probability Distributions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Gao%2C+Y">Yuanhai Gao</a>, <a href="/search/eess?searchtype=author&query=Xu%2C+X">Xiaoyuan Xu</a>, <a href="/search/eess?searchtype=author&query=Yan%2C+Z">Zheng Yan</a>, <a href="/search/eess?searchtype=author&query=Shahidehpour%2C+M">Mohammad Shahidehpour</a>, <a href="/search/eess?searchtype=author&query=Yang%2C+B">Bo Yang</a>, <a href="/search/eess?searchtype=author&query=Guan%2C+X">Xinping Guan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.12438v2-abstract-short" style="display: inline;"> High renewable energy penetration into power distribution systems causes a substantial risk of exceeding voltage security limits, which needs to be accurately assessed and properly managed. However, the existing methods usually rely on the joint probability models of power generation and loads provided by probabilistic prediction to quantify the voltage risks, where inaccurate prediction results c… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.12438v2-abstract-full').style.display = 'inline'; document.getElementById('2410.12438v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.12438v2-abstract-full" style="display: none;"> High renewable energy penetration into power distribution systems causes a substantial risk of exceeding voltage security limits, which needs to be accurately assessed and properly managed. However, the existing methods usually rely on the joint probability models of power generation and loads provided by probabilistic prediction to quantify the voltage risks, where inaccurate prediction results could lead to over or under estimated risks. This paper proposes an uncertain voltage component (UVC) prediction method for assessing and managing voltage risks. First, we define the UVC to evaluate voltage variations caused by the uncertainties associated with power generation and loads. Second, we propose a Gaussian mixture model-based probabilistic UVC prediction method to depict the non-Gaussian distribution of voltage variations. Then, we derive the voltage risk indices, including value-at-risk (VaR) and conditional value-at-risk (CVaR), based on the probabilistic UVC prediction model. Third, we investigate the mechanism of UVC-based voltage risk management and establish the voltage risk management problems, which are reformulated into linear programming or mixed-integer linear programming for convenient solutions. 
The proposed method is tested on power distribution systems with actual photovoltaic power and load data and compared with those considering probabilistic prediction of nodal power injections. Numerical results show that the proposed method is computationally efficient in assessing voltage risks and outperforms existing methods in managing voltage risks. The deviation of voltage risks obtained by the proposed method is only 15% of that by the methods based on probabilistic prediction of nodal power injections. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.12438v2-abstract-full').style.display = 'none'; document.getElementById('2410.12438v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 16 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.11495">arXiv:2410.11495</a> <span> [<a href="https://arxiv.org/pdf/2410.11495">pdf</a>, <a href="https://arxiv.org/format/2410.11495">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> GBSense: A GHz-Bandwidth Compressed Spectrum Sensing System </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Song%2C+Z">Zihang Song</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+X">Xingjian Zhang</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+Z">Zhe Chen</a>, <a href="/search/eess?searchtype=author&query=Tafazolli%2C+R">Rahim Tafazolli</a>, <a href="/search/eess?searchtype=author&query=Gao%2C+Y">Yue Gao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.11495v3-abstract-short" style="display: inline;"> This paper presents GBSense, an innovative compressed spectrum sensing system designed for GHz-bandwidth signals in dynamic spectrum access (DSA) applications. GBSense introduces an efficient approach to periodic nonuniform sampling, capturing wideband signals using significantly lower sampling rates compared to traditional Nyquist sampling. By integrating time-interleaved analog-to-digital conver… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.11495v3-abstract-full').style.display = 'inline'; document.getElementById('2410.11495v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.11495v3-abstract-full" style="display: none;"> This paper presents GBSense, an innovative compressed spectrum sensing system designed for GHz-bandwidth signals in dynamic spectrum access (DSA) applications. GBSense introduces an efficient approach to periodic nonuniform sampling, capturing wideband signals using significantly lower sampling rates compared to traditional Nyquist sampling. 
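The risk indices named in the abstract can be computed directly from a fitted Gaussian mixture. A minimal sketch, assuming one-dimensional UVC samples and a 95% confidence level (both illustrative choices, not the paper's setup):

```python
import numpy as np
from sklearn.mixture import GaussianMixture

rng = np.random.default_rng(0)
# Toy stand-in for historical UVC samples (per-unit voltage deviations)
uvc = np.concatenate([rng.normal(0.00, 0.010, 4000),
                      rng.normal(0.03, 0.005, 1000)])   # non-Gaussian mixture

gmm = GaussianMixture(n_components=2).fit(uvc.reshape(-1, 1))

# Monte Carlo evaluation of VaR/CVaR from the fitted mixture
samples = gmm.sample(100_000)[0].ravel()
alpha = 0.95
var = np.quantile(samples, alpha)        # value-at-risk: alpha-quantile
cvar = samples[samples >= var].mean()    # conditional VaR: tail expectation
print(f"VaR_{alpha:.2f} = {var:.4f} p.u., CVaR = {cvar:.4f} p.u.")
```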
arXiv:2410.11495 [pdf, other] https://arxiv.org/abs/2410.11495
Tags: eess.SP (Signal Processing)
Title: GBSense: A GHz-Bandwidth Compressed Spectrum Sensing System
Authors: Zihang Song, Xingjian Zhang, Zhe Chen, Rahim Tafazolli, Yue Gao
Abstract: This paper presents GBSense, an innovative compressed spectrum sensing system designed for GHz-bandwidth signals in dynamic spectrum access (DSA) applications. GBSense introduces an efficient approach to periodic nonuniform sampling, capturing wideband signals using significantly lower sampling rates compared to traditional Nyquist sampling. By integrating time-interleaved analog-to-digital conversion, GBSense overcomes the hardware complexity typically associated with traditional multicoset sampling, providing precise, real-time adjustable sampling patterns without the need for analog delay circuits. The system's ability to process signals with a 2 GHz radio frequency bandwidth using only a 400 MHz average sampling rate enables more efficient spectrum monitoring and access in wideband cognitive radios. Lab tests demonstrate 100% accurate spectrum detection when the spectrum occupancy is below 100 MHz and over 80% accuracy for occupancy up to 200 MHz. Additionally, an integrated system utilizing a low-power Raspberry Pi processor achieves a low processing latency of around 30 ms per frame, demonstrating the system's potential for real-time applications in cognitive radio networks, 5G, and future 6G infrastructures.
Submitted: 18 October, 2024; v1 submitted 15 October, 2024; originally announced October 2024.
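Periodic nonuniform (multicoset) sampling keeps only p of every L Nyquist-grid samples at fixed offsets per period, so the average rate is (p/L) of the Nyquist rate. A toy sketch of the selection pattern; the 4 GHz Nyquist grid and the offset set are illustrative assumptions chosen to reproduce the paper's 400 MHz average rate:

```python
import numpy as np

fs_nyquist = 4e9                    # Nyquist rate for a 2 GHz RF bandwidth
L, offsets = 40, [0, 7, 19, 28]     # keep p=4 of every L=40 slots (toy pattern)

n = np.arange(40_000)               # Nyquist-grid sample indices
t = n / fs_nyquist
x = np.cos(2 * np.pi * 300e6 * t)   # toy narrowband occupant at 300 MHz

keep = np.isin(n % L, offsets)      # periodic nonuniform selection
samples, times = x[keep], t[keep]   # what the ADC bank actually records

avg_rate = fs_nyquist * len(offsets) / L
print(f"kept {keep.sum()} of {len(n)} samples, "
      f"average rate {avg_rate / 1e6:.0f} MHz")   # -> 400 MHz
```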
arXiv:2410.11002 [pdf, other] https://arxiv.org/abs/2410.11002
Tags: eess.SP (Signal Processing)
Title: Optimizing Radio Access Technology Selection and Precoding in CV-Aided ISAC Systems
Authors: Yulan Gao, Ziqiang Ye, Ming Xiao, Yue Xiao
Abstract: Integrated Sensing and Communication (ISAC) systems promise to revolutionize wireless networks by concurrently supporting high-resolution sensing and high-performance communication. This paper presents a novel radio access technology (RAT) selection framework that capitalizes on vision sensing from base station (BS) cameras to optimize both communication and perception capabilities within the ISAC system. Our framework strategically employs two distinct RATs, LTE and millimeter wave (mmWave), to enhance system performance. We propose a vision-based user localization method that employs a 3D detection technique to capture the spatial distribution of users within the surrounding environment. This is followed by geometric calculations to accurately determine the state of mmWave communication links between the BS and individual users. Additionally, we integrate the SlowFast model to recognize user activities, facilitating adaptive transmission rate allocation based on observed behaviors. We develop a Deep Deterministic Policy Gradient (DDPG)-based algorithm, utilizing the joint distribution of users and their activities, designed to maximize the total transmission rate for all users through joint RAT selection and precoding optimization, while adhering to constraints on sensing mutual information and minimum transmission rates. Numerical simulation results demonstrate the effectiveness of the proposed framework in dynamically adjusting resource allocation, ensuring high-quality communication under challenging conditions.
Submitted: 14 October, 2024; originally announced October 2024.
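The "geometric calculations" step reduces to a line-of-sight test between the BS and each camera-localized user, since mmWave links degrade sharply when blocked. A minimal sketch with an axis-aligned box obstacle; the positions, obstacle layout, and the blocked-implies-LTE rule are illustrative assumptions, not the paper's algorithm:

```python
import numpy as np

def segment_hits_aabb(p0, p1, box_min, box_max):
    """Slab test: does the segment p0 -> p1 intersect an axis-aligned box?"""
    d = p1 - p0
    t0, t1 = 0.0, 1.0
    for k in range(3):
        if abs(d[k]) < 1e-12:
            if p0[k] < box_min[k] or p0[k] > box_max[k]:
                return False
        else:
            a = (box_min[k] - p0[k]) / d[k]
            b = (box_max[k] - p0[k]) / d[k]
            t0, t1 = max(t0, min(a, b)), min(t1, max(a, b))
            if t0 > t1:
                return False
    return True

bs = np.array([0.0, 0.0, 10.0])                     # BS position (toy)
user = np.array([25.0, 5.0, 1.5])                   # from 3D detection (toy)
obstacle = (np.array([10, -2, 0]), np.array([14, 8, 6]))
blocked = segment_hits_aabb(bs, user, *obstacle)
print("LTE" if blocked else "mmWave")               # toy RAT selection rule
```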
arXiv:2410.09834 [pdf, other] https://arxiv.org/abs/2410.09834
Tags: cs.CV (Computer Vision and Pattern Recognition); eess.IV (Image and Video Processing)
Title: Towards Defining an Efficient and Expandable File Format for AI-Generated Contents
Authors: Yixin Gao, Runsen Feng, Xin Li, Weiping Li, Zhibo Chen
Abstract: Recently, AI-generated content (AIGC) has gained significant traction due to its powerful creation capability. However, the storage and transmission of large amounts of high-quality AIGC images inevitably pose new challenges for recent file formats. To overcome this, we define a new file format for AIGC images, named AIGIF, enabling ultra-low-bitrate coding of AIGC images. Unlike existing file formats, which compress AIGC images in pixel space, AIGIF instead compresses the generation syntax. This raises a crucial question: which generation syntax elements, e.g., text prompt, device configuration, etc., are necessary for compression/transmission? To answer this question, we systematically investigate the effects of three essential factors: platform, generative model, and data configuration. We experimentally find that a well-designed composable bitstream structure incorporating the above three factors can achieve an impressive compression ratio of up to 1/10,000 while still ensuring high fidelity. We also introduce an expandable syntax in AIGIF to support the extension of the most advanced generation models to be developed in the future.
Submitted: 15 October, 2024; v1 submitted 13 October, 2024; originally announced October 2024.
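Compressing "generation syntax" rather than pixels means the file stores only what is needed to regenerate the image. A toy sketch of a composable container carrying the three factors named in the abstract; the field names and the codec choice (JSON plus zlib) are assumptions for illustration, not the AIGIF specification:

```python
import json
import zlib

record = {
    "platform": "example-platform",        # hypothetical identifiers
    "model": "example-diffusion-v1",
    "data_config": {                       # everything needed to regenerate
        "prompt": "a watercolor lighthouse at dawn",
        "seed": 1234,
        "steps": 30,
        "cfg_scale": 7.5,
        "size": [1024, 1024],
    },
    "ext": {},   # expandable syntax: room for future model families
}

bitstream = zlib.compress(json.dumps(record, separators=(",", ":")).encode())
png_size = 2_500_000   # rough size of a 1024x1024 PNG, for scale (assumption)
print(len(bitstream), "bytes vs", png_size,
      "bytes, ratio ~ 1:%d" % (png_size // len(bitstream)))
```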
arXiv:2410.07591 [pdf, other] https://arxiv.org/abs/2410.07591
Tags: eess.SP (Signal Processing)
Title: Robustness and Security Enhancement of Radio Frequency Fingerprint Identification in Time-Varying Channels
Authors: Lu Yang, Seyit Camtepe, Yansong Gao, Vicky Liu, Dhammika Jayalath
Abstract: Radio frequency fingerprint identification (RFFI) is becoming increasingly popular, especially in applications with constrained power, such as the Internet of Things (IoT). Due to subtle manufacturing variations, wireless devices have unique radio frequency fingerprints (RFFs). These RFFs can be used with pattern recognition algorithms to classify wireless devices. However, implementing reliable RFFI in time-varying channels is challenging because RFFs are often distorted by channel effects, reducing the classification accuracy. This paper introduces a new channel-robust RFF and leverages transfer learning to enhance RFFI in time-varying channels. Experimental results show that the proposed RFFI system achieved an average classification accuracy improvement of 33.3% in indoor environments and 34.5% in outdoor environments. This paper also analyzes the security of the proposed RFFI system to address the security flaw in formalized impersonation attacks. Since RFF collection is carried out in uncontrolled deployment environments, RFFI systems can be targeted with false RFFs sent by rogue devices. The resulting classifiers may classify the rogue devices as legitimate, effectively replacing their true identities. To defend against impersonation attacks, a novel keyless countermeasure is proposed, which exploits the intrinsic output of the softmax function after classifier training without sacrificing the lightweight nature of RFFI. Experimental results demonstrate an average increase of 0.3 in the area under the receiver operating characteristic curve (AUC), with a 40.0% improvement in attack detection rate in indoor and outdoor environments.
Submitted: 9 October, 2024; originally announced October 2024.
Comments: 15 pages
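A countermeasure that "exploits the intrinsic output of the softmax function" amounts to thresholding the classifier's confidence: genuine devices tend to produce peaked softmax outputs, while unseen rogue devices produce flatter ones. A minimal sketch of such a rejection rule; the logits and the threshold value are illustrative assumptions:

```python
import numpy as np

def softmax(z):
    e = np.exp(z - z.max())
    return e / e.sum()

def accept(logits, threshold=0.90):
    """Keyless rejection rule: accept the predicted identity only if the
    top softmax probability clears a confidence threshold (assumed value)."""
    p = softmax(logits)
    label, conf = int(p.argmax()), float(p.max())
    return (label, conf) if conf >= threshold else (None, conf)  # None = reject

print(accept(np.array([8.2, 1.1, 0.4, -0.5])))  # peaked -> accepted
print(accept(np.array([1.3, 1.1, 0.9, 1.2])))   # flat -> rejected (rogue-like)
```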
arXiv:2410.02358 [pdf] https://arxiv.org/abs/2410.02358
Tags: eess.SY (Systems and Control)
Title: Cross-Domain Comparative Analysis of Digital Twins and Universalised Solutions
Authors: Guanyu Xiong, Yan Gao, Haijiang Li
Abstract: Digitalisation is one of the main drivers of most economic sectors nowadays, and the digital twin, as a reification of digitalisation for complex systems, has attracted much attention from both academia and industry. There have been studies focusing on digital twins in a specific sector, while few exercise insightful comparisons of digital twins from different domains. Considering that digital twinning is a cross-domain transformation, it is beneficial to establish the principles of universality and variation that can explain similarities and differences in any digital twins. This paper first delivers a comparative analysis of digital twins in five domains through a six-dimensional characterisation framework. Then, departing from the correlations among domain-specific DT development, a cross-domain Digital Twin Platform-as-a-Service (DT-PaaS) is proposed to universalise the common process, tools and applications, meanwhile being inclusive of the variations of every digital twin instance. As a centralised data, modelling and service platform, it is expected to break the barriers between domains by enabling cross-domain digital twin data sharing, interoperability and development synergy, and to help tackle complex global challenges such as the climate challenge, net zero and pandemics.
Submitted: 3 October, 2024; originally announced October 2024.
arXiv:2410.01841 [pdf] https://arxiv.org/abs/2410.01841
Tags: eess.AS (Audio and Speech Processing); cs.AI (Artificial Intelligence); cs.CL (Computation and Language); cs.IR (Information Retrieval); cs.SD (Sound)
Title: A GEN AI Framework for Medical Note Generation
Authors: Hui Yi Leong, Yi Fan Gao, Shuai Ji, Bora Kalaycioglu, Uktu Pamuksuz
Abstract: The increasing administrative burden of medical documentation, particularly through Electronic Health Records (EHR), significantly reduces the time available for direct patient care and contributes to physician burnout. To address this issue, we propose MediNotes, an advanced generative AI framework designed to automate the creation of SOAP (Subjective, Objective, Assessment, Plan) notes from medical conversations. MediNotes integrates Large Language Models (LLMs), Retrieval-Augmented Generation (RAG), and Automatic Speech Recognition (ASR) to capture and process both text and voice inputs in real time or from recorded audio, generating structured and contextually accurate medical notes. The framework also incorporates advanced techniques like Quantized Low-Rank Adaptation (QLoRA) and Parameter-Efficient Fine-Tuning (PEFT) for efficient model fine-tuning in resource-constrained environments. Additionally, MediNotes offers a query-based retrieval system, allowing healthcare providers and patients to access relevant medical information quickly and accurately. Evaluations using the ACI-BENCH dataset demonstrate that MediNotes significantly improves the accuracy, efficiency, and usability of automated medical documentation, offering a robust solution to reduce the administrative burden on healthcare professionals while improving the quality of clinical workflows.
Submitted: 27 September, 2024; originally announced October 2024.
Comments: 8 figures, 7 pages, IEEE standard research paper
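QLoRA-style fine-tuning of the kind mentioned above is commonly set up with the Hugging Face peft and bitsandbytes integrations. A minimal configuration sketch; the base checkpoint name and all hyperparameters are placeholders, not the paper's settings:

```python
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model

# 4-bit quantized base model (the "Q" in QLoRA); checkpoint is a placeholder
bnb = BitsAndBytesConfig(load_in_4bit=True,
                         bnb_4bit_quant_type="nf4",
                         bnb_4bit_compute_dtype=torch.bfloat16)
model = AutoModelForCausalLM.from_pretrained("your-base-llm",
                                             quantization_config=bnb,
                                             device_map="auto")

# Low-rank adapters on the attention projections (illustrative hyperparameters)
lora = LoraConfig(r=16, lora_alpha=32, lora_dropout=0.05,
                  target_modules=["q_proj", "v_proj"],
                  task_type="CAUSAL_LM")
model = get_peft_model(model, lora)
model.print_trainable_parameters()   # only the adapter weights are trainable
```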
arXiv:2410.00392 [pdf, other] https://arxiv.org/abs/2410.00392
Tags: eess.SY (Systems and Control); cs.AR (Hardware Architecture)
Title: MERIT: Multimodal Wearable Vital Sign Waveform Monitoring
Authors: Yongyang Tang, Zhe Chen, Ang Li, Tianyue Zheng, Zheng Lin, Jia Xu, Pin Lv, Zhe Sun, Yue Gao
Abstract: Cardiovascular disease (CVD) is the leading cause of death and premature mortality worldwide, with occupational environments significantly influencing CVD risk, underscoring the need for effective cardiac monitoring and early-warning systems. Existing methods of monitoring vital signs require subjects to remain stationary, which is impractical for daily monitoring as individuals are often in motion. To address this limitation, we propose MERIT, a multimodality-based wearable system designed for precise ECG waveform monitoring without movement restrictions. Daily activities, involving frequent arm movements, can significantly affect sensor data and complicate the reconstruction of accurate ECG signals. To mitigate motion impact and enhance ECG signal reconstruction, we introduce a deep independent component analysis (Deep-ICA) module and a multimodal fusion module. We conducted experiments with 15 subjects. Our results, compared with commercial wearable devices and existing methods, demonstrate that MERIT accurately reconstructs ECG waveforms during various office activities, offering a reliable solution for fine-grained cardiac monitoring in dynamic environments.
Submitted: 21 November, 2024; v1 submitted 1 October, 2024; originally announced October 2024.
Comments: 8 pages, 10 figures
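As a rough intuition for the ICA step: treat the wearable channels as linear mixtures of a cardiac source and a motion artifact, then unmix them. A classical (non-deep) FastICA stand-in on synthetic data; the paper's Deep-ICA module is a learned variant, so this is only an analogy:

```python
import numpy as np
from sklearn.decomposition import FastICA

rng = np.random.default_rng(0)
t = np.linspace(0, 10, 4000)
cardiac = np.sin(2 * np.pi * 1.2 * t) ** 15      # spiky, ECG-like pulse (toy)
motion = np.sign(np.sin(2 * np.pi * 0.3 * t))    # slow arm-swing artifact

S = np.c_[cardiac, motion]
A = np.array([[1.0, 0.6],                        # unknown sensor mixing
              [0.4, 1.0]])
X = S @ A.T + 0.01 * rng.standard_normal((len(t), 2))  # two wearable channels

ica = FastICA(n_components=2, random_state=0)
recovered = ica.fit_transform(X)   # columns approximate the sources (up to
                                   # permutation/scale), isolating the ECG
```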
arXiv:2409.20485 [pdf, other] https://arxiv.org/abs/2409.20485
Tags: eess.SP (Signal Processing)
Title: Movable Antennas Enabled Wireless-Powered NOMA: Continuous and Discrete Positioning Designs
Authors: Ying Gao, Qingqing Wu, Wen Chen
Abstract: This paper investigates a movable antenna (MA)-enabled wireless-powered communication network (WPCN), where multiple wireless devices (WDs) first harvest energy from the downlink (DL) signal broadcast by a hybrid access point (HAP) and then transmit information in the uplink (UL) using non-orthogonal multiple access. Unlike conventional WPCNs with fixed-position antennas (FPAs), this MA-enabled WPCN allows the MAs at the HAP and the WDs to adjust their positions twice: once before DL wireless power transfer and once before UL wireless information transmission. Our goal is to maximize the system sum throughput by jointly optimizing the MA positions, the time allocation, and the UL power allocation. Considering the characteristics of antenna movement, we explore both continuous and discrete positioning designs, which, after formulation, are found to be non-convex optimization problems. Before tackling these problems, we rigorously prove that using identical MA positions for both DL and UL is the optimal strategy in both scenarios, thereby greatly simplifying the problems and enabling easier practical implementation of the system. We then propose alternating optimization-based algorithms for the resulting simplified problems. Simulation results show that: 1) the proposed continuous MA scheme can enhance the sum throughput by up to 395.71% compared to the benchmark with FPAs, even when additional compensation transmission time is provided to the latter; 2) a step size of one-quarter wavelength for the MA motion driver is generally sufficient for the proposed discrete MA scheme to achieve over 80% of the sum throughput performance of the continuous MA scheme; 3) when each moving region is large enough to include multiple optimal positions for the continuous MA scheme, the discrete MA scheme can achieve comparable sum throughput without requiring an excessively small step size.
Submitted: 30 September, 2024; originally announced September 2024.
Comments: 13 pages, 10 figures (subfigures included), submitted to an IEEE journal for possible publication
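In the discrete design, each antenna position is restricted to a grid whose pitch is the motion driver's step size; finding 2) says a quarter-wavelength grid already captures most of the continuous scheme's gain. A toy grid search over one MA position against a simple two-path channel; the channel model, geometry, and region size are illustrative assumptions, not the paper's system model:

```python
import numpy as np

c, f = 3e8, 2.4e9
lam = c / f
step = lam / 4                       # quarter-wavelength motion-driver step

def channel_gain(pos):
    """Toy two-path channel: direct plus reflected ray; the gain depends on
    where the antenna sits inside its moving region (assumption)."""
    d1 = 10.0 + pos
    d2 = 13.0 - 0.8 * pos
    h = (np.exp(-2j * np.pi * d1 / lam) / d1
         + 0.7 * np.exp(-2j * np.pi * d2 / lam) / d2)
    return abs(h) ** 2

grid = np.arange(0.0, 8 * lam, step)          # discrete candidate positions
best = max(grid, key=channel_gain)
print(f"best position {best / lam:.2f} lambda, gain {channel_gain(best):.3e}")
```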
arXiv:2409.14738 [pdf, other] https://arxiv.org/abs/2409.14738
Tags: cs.RO (Robotics); eess.SY (Systems and Control)
Title: Enabling On-Chip High-Frequency Adaptive Linear Optimal Control via Linearized Gaussian Process
Authors: Yuan Gao, Yinyi Lai, Jun Wang, Yini Fang
Abstract: Unpredictable and complex aerodynamic effects pose significant challenges to achieving precise flight control, such as the downwash effect from upper vehicles to lower ones. Conventional methods often struggle to accurately model these interactions, leading to controllers that require large safety margins between vehicles. Moreover, the controller on real drones usually runs at high frequency with limited on-chip computation, making adaptive control designs more difficult to implement. To address these challenges, we incorporate a Gaussian process (GP) to model the adaptive external aerodynamics within linear model predictive control. The GP is linearized to enable real-time high-frequency solutions. Moreover, to handle the error caused by linearization, we integrate end-to-end Bayesian optimization during the sample collection stage to improve the control performance. Experimental results on both simulations and real quadrotors show that we can achieve real-time solvable computation speed with acceptable tracking errors.
Submitted: 27 September, 2024; v1 submitted 23 September, 2024; originally announced September 2024.
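Linearizing a GP means replacing its posterior mean with a first-order expansion around the current operating point, so the learned residual model can enter a linear MPC. A sketch using scikit-learn's GP regressor and a finite-difference Jacobian; the kernel choice and the toy residual dynamics are assumptions:

```python
import numpy as np
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF

rng = np.random.default_rng(0)
# Toy residual aerodynamic force as a function of a 2-D state (assumption)
f = lambda X: np.sin(X[:, 0]) * np.exp(-X[:, 1] ** 2)
X_train = rng.uniform(-2, 2, (60, 2))
y_train = f(X_train) + 0.01 * rng.standard_normal(60)

gp = GaussianProcessRegressor(kernel=RBF(length_scale=1.0)).fit(X_train, y_train)

def linearize(gp, x0, eps=1e-4):
    """First-order model of the GP mean around x0: f(x) ~ f0 + J @ (x - x0)."""
    f0 = gp.predict(x0[None])[0]
    J = np.array([(gp.predict((x0 + eps * e)[None])[0] - f0) / eps
                  for e in np.eye(len(x0))])
    return f0, J      # constants that slot into a linear MPC prediction model

f0, J = linearize(gp, np.array([0.5, 0.2]))
print(f0, J)
```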
arXiv:2409.12007 [pdf, other] https://arxiv.org/abs/2409.12007
Tags: cs.RO (Robotics); eess.SY (Systems and Control)
Title: Real-Time-Feasible Collision-Free Motion Planning For Ellipsoidal Objects
Authors: Yunfan Gao, Florian Messerer, Niels van Duijkeren, Boris Houska, Moritz Diehl
Abstract: Online planning of collision-free trajectories is a fundamental task for robotics and self-driving car applications. This paper revisits collision avoidance between ellipsoidal objects using differentiable constraints. Two ellipsoids do not overlap if and only if the endpoint of the vector between the center points of the ellipsoids does not lie in the interior of the Minkowski sum of the ellipsoids. This condition is formulated using a parametric over-approximation of the Minkowski sum, which can be made tight in any given direction. The resulting collision avoidance constraint is included in an optimal control problem (OCP) and evaluated in comparison to the separating-hyperplane approach. Not only do we observe that the Minkowski-sum formulation is computationally more efficient in our experiments, but also that using pre-determined over-approximation parameters based on warm-start trajectories leads to a very limited increase in suboptimality. This gives rise to a novel real-time scheme for collision-free motion planning with model predictive control (MPC). Both the real-time feasibility and the effectiveness of the constraint formulation are demonstrated in challenging real-world experiments.
Submitted: 18 September, 2024; originally announced September 2024.
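A standard parametric outer ellipsoid for the Minkowski sum of two ellipsoids {x : x^T P^{-1} x <= 1} has shape matrix S(lambda) = (1 + 1/lambda) P1 + (1 + lambda) P2, tight in a chosen direction for the right lambda. A sketch of the resulting differentiable non-overlap test, assuming this classical over-approximation (the paper's exact parametrization may differ); shapes and the lambda grid are illustrative:

```python
import numpy as np

def no_overlap(c1, P1, c2, P2, lambdas=np.geomspace(0.1, 10, 50)):
    """Sufficient separation test for ellipsoids {x: (x-c)^T P^{-1} (x-c) <= 1}.

    S(lam) = (1 + 1/lam) P1 + (1 + lam) P2 outer-bounds the Minkowski sum;
    if the center offset lies outside S(lam) for some lam, there is no overlap.
    """
    d = c2 - c1
    for lam in lambdas:
        S = (1 + 1 / lam) * P1 + (1 + lam) * P2
        if d @ np.linalg.solve(S, d) > 1.0:
            return True        # certified collision-free
    return False               # possibly overlapping (test inconclusive)

P1 = np.diag([1.0, 0.25])      # semi-axes 1.0 and 0.5 (P = diag(a^2, b^2))
P2 = np.diag([0.25, 1.0])
print(no_overlap(np.zeros(2), P1, np.array([3.0, 0.0]), P2))  # True: far apart
print(no_overlap(np.zeros(2), P1, np.array([1.2, 0.0]), P2))  # False: too close
```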
arXiv:2409.09754 [pdf, other] https://arxiv.org/abs/2409.09754
Tags: cs.CV (Computer Vision and Pattern Recognition); cs.RO (Robotics); eess.IV (Image and Video Processing); physics.optics (Optics)
Title: Towards Single-Lens Controllable Depth-of-Field Imaging via All-in-Focus Aberration Correction and Monocular Depth Estimation
Authors: Xiaolong Qian, Qi Jiang, Yao Gao, Shaohua Gao, Zhonghua Yi, Lei Sun, Kai Wei, Haifeng Li, Kailun Yang, Kaiwei Wang, Jian Bai
Abstract: Controllable Depth-of-Field (DoF) imaging commonly produces amazing visual effects based on heavy and expensive high-end lenses. However, confronted with the increasing demand for mobile scenarios, it is desirable to achieve a lightweight solution with Minimalist Optical Systems (MOS). This work centers around two major limitations of MOS, i.e., the severe optical aberrations and uncontrollable DoF, for achieving single-lens controllable DoF imaging via computational methods. A Depth-aware Controllable DoF Imaging (DCDI) framework is proposed, equipped with All-in-Focus (AiF) aberration correction and monocular depth estimation, where the recovered image and corresponding depth map are utilized to produce imaging results under diverse DoFs of any high-end lens via patch-wise convolution. To address the depth-varying optical degradation, we introduce a Depth-aware Degradation-adaptive Training (DA2T) scheme. At the dataset level, a Depth-aware Aberration MOS (DAMOS) dataset is established based on the simulation of Point Spread Functions (PSFs) under different object distances. Additionally, we design two plug-and-play depth-aware mechanisms to embed depth information into aberration image recovery to better tackle depth-aware degradation. Furthermore, we propose a storage-efficient Omni-Lens-Field model to represent the 4D PSF library of various lenses. With the predicted depth map, recovered image, and depth-aware PSF map inferred by Omni-Lens-Field, single-lens controllable DoF imaging is achieved. Comprehensive experimental results demonstrate that the proposed framework enhances recovery performance and attains impressive single-lens controllable DoF imaging results, providing a seminal baseline for this field. The source code and the established dataset will be publicly available at https://github.com/XiaolongQian/DCDI.
Submitted: 15 September, 2024; originally announced September 2024.
Comments: The source code and the established dataset will be publicly available at https://github.com/XiaolongQian/DCDI
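Rendering a target DoF from an AiF image plus a depth map works by blurring each depth layer with that depth's PSF and compositing the layers. A layered toy version using Gaussian PSFs as stand-ins for a real lens's PSF library; the bin count and the linear blur-versus-defocus law are assumptions:

```python
import numpy as np
from scipy.ndimage import gaussian_filter

def render_dof(aif, depth, focus_depth, n_bins=8, blur_per_m=3.0):
    """Composite depth-binned blurred layers of an all-in-focus image.

    aif: (H, W) image; depth: (H, W) metric depth map. Gaussian PSFs whose
    width grows with defocus |d - focus_depth| stand in for a real PSF library.
    """
    out = np.zeros_like(aif)
    edges = np.linspace(depth.min(), depth.max(), n_bins + 1)
    for lo, hi in zip(edges[:-1], edges[1:]):
        mask = (depth >= lo) & (depth <= hi)
        if not mask.any():
            continue
        sigma = blur_per_m * abs(0.5 * (lo + hi) - focus_depth)  # toy defocus law
        out[mask] = gaussian_filter(aif, sigma)[mask]
    return out

aif = np.random.rand(128, 128)
depth = np.tile(np.linspace(1.0, 5.0, 128), (128, 1))   # synthetic depth ramp
shallow = render_dof(aif, depth, focus_depth=2.0)
```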
arXiv:2409.08371 [pdf, other] https://arxiv.org/abs/2409.08371
Tags: cs.RO (Robotics); eess.SY (Systems and Control)
Title: Time-Varying Foot-Placement Control for Underactuated Humanoid Walking on Swaying Rigid Surfaces
Authors: Yuan Gao, Victor Paredes, Yukai Gong, Zijian He, Ayonga Hereid, Yan Gu
Abstract: Locomotion on a dynamic rigid surface (i.e., a rigid surface accelerating in an inertial frame) presents complex challenges for controller design, which are essential for deploying humanoid robots in dynamic real-world environments such as moving trains, ships, and airplanes. This paper introduces a real-time, provably stabilizing control approach for underactuated humanoid walking on periodically swaying rigid surfaces. The first key contribution is the analytical extension of the classical angular momentum-based linear inverted pendulum model from static to swaying grounds. This extension results in a time-varying, nonhomogeneous robot model, which is fundamentally different from existing pendulum models. We synthesize a discrete footstep control law for the model and derive a new set of sufficient stability conditions that verify the controller's stabilizing effect. Another key contribution is the development of a hierarchical control framework that incorporates the proposed footstep control law as its higher-layer planner to ensure the stability of underactuated walking. The closed-loop stability of the complete hybrid, full-order robot dynamics under this control framework is provably analyzed based on nonlinear control theory. Finally, experiments conducted on a Digit humanoid robot, both in simulation and on hardware, demonstrate the framework's effectiveness in addressing underactuated bipedal locomotion on swaying ground, even in the presence of uncertain surface motions and unknown external pushes.
Submitted: 12 September, 2024; originally announced September 2024.
Comments: 20 pages, 18 figures
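For intuition about discrete footstep control on the classical static-ground LIP model that this paper generalizes: between steps the pendulum evolves analytically, and at each step the foot is repositioned with a capture-point-style rule. Everything below (gains, timing, and the static-ground restriction) is a toy assumption, not the paper's time-varying law:

```python
import numpy as np

g, H, T = 9.81, 0.9, 0.4     # gravity, CoM height, step period (toy values)
omega = np.sqrt(g / H)

def lip_step(x, v):
    """Analytic LIP evolution over one step: x'' = omega^2 * x (foot at 0)."""
    c, s = np.cosh(omega * T), np.sinh(omega * T)
    return c * x + (s / omega) * v, omega * s * x + c * v

x, v = 0.05, 0.3             # CoM offset/velocity relative to the stance foot
v_des = 0.3
for k in range(10):
    x, v = lip_step(x, v)
    # Capture-point-style placement: step toward the divergent component,
    # offset so the walk settles near v_des (toy deadbeat-like rule)
    u = x + v / omega - v_des / omega
    x -= u                   # re-express CoM relative to the new stance foot
    print(f"step {k}: v = {v:.3f}")
```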
arXiv:2409.06887 [pdf, other] https://arxiv.org/abs/2409.06887
Tags: eess.IV (Image and Video Processing); cs.CV (Computer Vision and Pattern Recognition)
Title: Ordinal Learning: Longitudinal Attention Alignment Model for Predicting Time to Future Breast Cancer Events from Mammograms
Authors: Xin Wang, Tao Tan, Yuan Gao, Eric Marcus, Luyi Han, Antonio Portaluri, Tianyu Zhang, Chunyao Lu, Xinglong Liang, Regina Beets-Tan, Jonas Teuwen, Ritse Mann
Abstract: Precision breast cancer (BC) risk assessment is crucial for developing individualized screening and prevention. Despite the promising potential of recent mammogram (MG) based deep learning models in predicting BC risk, they mostly overlook the 'time-to-future-event' ordering among patients and exhibit limited exploration of how they track historical changes in breast tissue, thereby limiting their clinical application. In this work, we propose a novel method, named OA-BreaCR, to precisely model the ordinal relationship of the time to and between BC events while incorporating longitudinal breast tissue changes in a more explainable manner. We validate our method on the public EMBED and in-house datasets, comparing it with existing BC risk prediction and time prediction methods. Our ordinal learning method OA-BreaCR outperforms existing methods in both BC risk and time-to-future-event prediction tasks. Additionally, ordinal heatmap visualizations show the model's attention over time. Our findings underscore the importance of interpretable and precise risk assessment for enhancing BC screening and prevention efforts. The code will be accessible to the public.
Submitted: 10 September, 2024; originally announced September 2024.
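Ordinal time-to-event learning is often implemented with cumulative-link targets: K-1 binary questions of the form "does the event occur after threshold k?". A minimal sketch of such an ordinal loss (CORAL-style); the threshold construction and shapes are generic assumptions, not necessarily the paper's exact formulation:

```python
import numpy as np

def ordinal_loss(scores, y, K):
    """Cumulative-logit ordinal loss.

    scores: (batch, K-1) logits, one per ordinal threshold.
    y: (batch,) integer time-to-event bins in [0, K-1].
    """
    # Binary target matrix: targets[i, k] = 1 iff y_i > k
    targets = (y[:, None] > np.arange(K - 1)[None, :]).astype(float)
    p = 1.0 / (1.0 + np.exp(-scores))
    eps = 1e-9
    bce = -(targets * np.log(p + eps) + (1 - targets) * np.log(1 - p + eps))
    return bce.mean()

rng = np.random.default_rng(0)
K = 5                                      # e.g., 5 time-to-event bins
scores = rng.standard_normal((8, K - 1))
y = rng.integers(0, K, 8)
print(ordinal_loss(scores, y, K))
```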
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.05809">arXiv:2409.05809</a> <span> [<a href="https://arxiv.org/pdf/2409.05809">pdf</a>, <a href="https://arxiv.org/format/2409.05809">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Optics">physics.optics</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> A Flexible Framework for Universal Computational Aberration Correction via Automatic Lens Library Generation and Domain Adaptation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Jiang%2C+Q">Qi Jiang</a>, <a href="/search/eess?searchtype=author&query=Gao%2C+Y">Yao Gao</a>, <a href="/search/eess?searchtype=author&query=Gao%2C+S">Shaohua Gao</a>, <a href="/search/eess?searchtype=author&query=Yi%2C+Z">Zhonghua Yi</a>, <a href="/search/eess?searchtype=author&query=Sun%2C+L">Lei Sun</a>, <a href="/search/eess?searchtype=author&query=Shi%2C+H">Hao Shi</a>, <a href="/search/eess?searchtype=author&query=Yang%2C+K">Kailun Yang</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+K">Kaiwei Wang</a>, <a href="/search/eess?searchtype=author&query=Bai%2C+J">Jian Bai</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.05809v1-abstract-short" style="display: inline;"> Emerging universal Computational Aberration Correction (CAC) paradigms provide an inspiring solution to light-weight and high-quality imaging without repeated data preparation and model training to accommodate new lens designs. However, the training databases in these approaches, i.e., the lens libraries (LensLibs), suffer from their limited coverage of real-world aberration behaviors. In this wor… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.05809v1-abstract-full').style.display = 'inline'; document.getElementById('2409.05809v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.05809v1-abstract-full" style="display: none;"> Emerging universal Computational Aberration Correction (CAC) paradigms provide an inspiring solution to light-weight and high-quality imaging without repeated data preparation and model training to accommodate new lens designs. However, the training databases in these approaches, i.e., the lens libraries (LensLibs), suffer from their limited coverage of real-world aberration behaviors. In this work, we set up an OmniLens framework for universal CAC, considering both the generalization ability and flexibility. OmniLens extends the idea of universal CAC to a broader concept, where a base model is trained for three cases, including zero-shot CAC with the pre-trained model, few-shot CAC with a little lens-specific data for fine-tuning, and domain adaptive CAC using domain adaptation for lens-descriptions-unknown lens. 
On the data side, we first propose an Evolution-based Automatic Optical Design (EAOD) pipeline to construct a LensLib automatically, coined AODLib, whose diversity is enriched by an evolution framework with comprehensive constraints and a hybrid optimization strategy to achieve realistic aberration behaviors. For network design, we introduce the guidance of high-quality codebook priors to facilitate zero-shot and few-shot CAC, which enhances the model's generalization ability while also boosting its convergence in the few-shot case. Furthermore, based on the statistical observation of dark channel priors in optical degradation, we design an unsupervised regularization term to adapt the base model to a target lens with unknown descriptions using its aberration images without ground truth. We validate OmniLens on 4 manually designed low-end lenses with various structures and aberration behaviors. Remarkably, the base model trained on AODLib exhibits strong generalization capabilities, achieving 97% of the lens-specific performance in a zero-shot setting.
Submitted 9 September, 2024; originally announced September 2024.
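
The dark-channel regularizer mentioned above can be pictured with a short sketch; the patch size, weighting, and function names are assumptions, not OmniLens's implementation.

    # Hedged sketch of a dark-channel-prior regularizer of the kind the
    # abstract describes; details (patch size, weighting) are guesses.
    import torch
    import torch.nn.functional as F

    def dark_channel(img, patch=7):
        # img: (B, 3, H, W) in [0, 1]; min over channels, then a local
        # minimum, implemented as a negated max-pool.
        per_pixel_min = img.min(dim=1, keepdim=True).values
        return -F.max_pool2d(-per_pixel_min, patch, stride=1, padding=patch // 2)

    def dcp_regularizer(restored):
        # Aberration-degraded images tend to have elevated dark channels;
        # pushing the restored image's dark channel toward zero is an
        # unsupervised cue that needs no ground-truth sharp image.
        return dark_channel(restored).abs().mean()

    x = torch.rand(2, 3, 64, 64)
    print(dcp_regularizer(x))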

arXiv:2409.00815 (https://arxiv.org/abs/2409.00815) [cs.SD, cs.AI, eess.AS]
Title: Serialized Speech Information Guidance with Overlapped Encoding Separation for Multi-Speaker Automatic Speech Recognition
Authors: Hao Shi, Yuan Gao, Zhaoheng Ni, Tatsuya Kawahara
Abstract: Serialized output training (SOT) attracts increasing attention due to its convenience and flexibility for multi-speaker automatic speech recognition (ASR). However, it is not easy to train with attention loss alone. In this paper, we propose the overlapped encoding separation (EncSep) to fully utilize the benefits of the connectionist temporal classification (CTC) and attention hybrid loss. An additional separator is inserted after the encoder to extract the multi-speaker information with CTC losses. Furthermore, we propose the serialized speech information guidance SOT (GEncSep) to further utilize the separated encodings. The separated streams are concatenated to provide single-speaker information to guide attention during decoding. Experimental results on LibriMix show that the single-speaker encoding can be separated from the overlapped encoding. The CTC loss helps to improve the encoder representation under complex scenarios, and GEncSep further improves performance.
Submitted 10 September, 2024; v1 submitted 1 September, 2024; originally announced September 2024.
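
A minimal sketch of the CTC + attention hybrid objective that EncSep builds on, with made-up shapes and a commonly used CTC weight; it is not the paper's code.

    # Sketch of a CTC + attention hybrid loss (illustrative shapes only).
    import torch
    import torch.nn.functional as F

    T, B, V = 50, 4, 30                    # frames, batch, vocab (0 = blank)
    log_probs = torch.randn(T, B, V).log_softmax(-1)  # encoder/CTC branch
    targets = torch.randint(1, V, (B, 12))            # reference tokens
    input_lens = torch.full((B,), T, dtype=torch.long)
    target_lens = torch.full((B,), 12, dtype=torch.long)

    ctc = F.ctc_loss(log_probs, targets, input_lens, target_lens)

    dec_logits = torch.randn(B, 12, V)                # attention decoder branch
    att = F.cross_entropy(dec_logits.reshape(-1, V), targets.reshape(-1))

    lam = 0.3                 # a commonly used CTC weight (assumption)
    hybrid = lam * ctc + (1 - lam) * att
    print(hybrid)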

arXiv:2408.17073 (https://arxiv.org/abs/2408.17073) [eess.IV, cs.CV]
Title: Approximately Invertible Neural Network for Learned Image Compression
Authors: Yanbo Gao, Meng Fu, Shuai Li, Chong Lv, Xun Cai, Hui Yuan, Mao Ye
Abstract: Learned image compression has attracted considerable interest in recent years. It typically comprises an analysis transform, a synthesis transform, quantization, and an entropy coding model. The analysis and synthesis transforms are used to encode an image to a latent feature and to decode the quantized feature to reconstruct the image, and can be regarded as coupled transforms. However, the analysis and synthesis transforms are designed independently in existing methods, making them unreliable for high-quality image compression. Inspired by invertible neural networks in generative modeling, invertible modules are used to construct the coupled analysis and synthesis transforms. Considering that the noise introduced by feature quantization invalidates the invertible process, this paper proposes an Approximately Invertible Neural Network (A-INN) framework for learned image compression. It formulates the rate-distortion optimization of lossy image compression when using an INN with quantization, which differs from using an INN for generative modeling. Generally speaking, A-INN can serve as the theoretical foundation for any INN-based lossy compression method. Based on this formulation, A-INN with a progressive denoising module (PDM) is developed to effectively reduce the quantization noise during decoding. Moreover, a Cascaded Feature Recovery Module (CFRM) is designed to learn high-dimensional feature recovery from low-dimensional ones to further reduce the noise in feature channel compression. In addition, a Frequency-enhanced Decomposition and Synthesis Module (FDSM) is developed to explicitly enhance the high-frequency components in an image, addressing the loss of high-frequency information inherent in neural-network-based image compression. Extensive experiments demonstrate that the proposed A-INN outperforms existing learned image compression methods.
Submitted 30 August, 2024; originally announced August 2024.
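
The invertible building block behind INN-based codecs is a coupling layer; a bare-bones additive variant is sketched below (architecture and channel counts are illustrative, not A-INN's).

    # A bare-bones additive coupling layer: exactly invertible without
    # inverting the inner network t(). A sketch, not the paper's code.
    import torch
    import torch.nn as nn

    class AdditiveCoupling(nn.Module):
        def __init__(self, channels):
            super().__init__()
            self.t = nn.Sequential(
                nn.Conv2d(channels // 2, channels // 2, 3, padding=1),
                nn.ReLU(),
                nn.Conv2d(channels // 2, channels // 2, 3, padding=1))

        def forward(self, x):
            x1, x2 = x.chunk(2, dim=1)
            return torch.cat([x1, x2 + self.t(x1)], dim=1)

        def inverse(self, y):
            y1, y2 = y.chunk(2, dim=1)
            return torch.cat([y1, y2 - self.t(y1)], dim=1)

    layer = AdditiveCoupling(8)
    x = torch.randn(1, 8, 16, 16)
    print(torch.allclose(layer.inverse(layer(x)), x, atol=1e-6))  # True
    # Quantizing the latent breaks this exactness, which is the gap the
    # paper's denoising/recovery modules are designed to close.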

arXiv:2408.13056 (https://arxiv.org/abs/2408.13056) [eess.SP]
Title: GNSS Interference Classification Using Federated Reservoir Computing
Authors: Ziqiang Ye, Yulan Gao, Xinyue Liu, Yue Xiao, Ming Xiao, Saviour Zammit
Abstract: The expanding use of Unmanned Aerial Vehicles (UAVs) in vital areas like traffic management, surveillance, and environmental monitoring highlights the need for robust communication and navigation systems. Particularly vulnerable are Global Navigation Satellite Systems (GNSS), which face a spectrum of interference and jamming threats that can significantly undermine their performance. While traditional deep learning approaches are adept at mitigating these issues, they often fall short for UAV applications due to significant computational demands and the complexities of managing large, centralized datasets. In response, this paper introduces Federated Reservoir Computing (FedRC) as a potent and efficient solution tailored to enhance interference classification in GNSS systems used by UAVs. Our experimental results demonstrate that FedRC not only achieves faster convergence but also sustains lower loss levels than traditional models, highlighting its exceptional adaptability and operational efficiency.
Submitted 23 August, 2024; originally announced August 2024.
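
Reservoir computing pairs a fixed random recurrent "reservoir" with a cheap trainable readout, which makes federated averaging of the readout weights natural; a toy sketch under those assumptions (not the paper's FedRC implementation) follows.

    # Toy federated reservoir computing loop: fixed random reservoir,
    # ridge-regression readouts, FedAvg over readout weights only.
    import numpy as np

    rng = np.random.default_rng(0)
    N_RES, N_IN, N_CLS = 100, 4, 3
    W_in = rng.normal(size=(N_RES, N_IN)) * 0.5
    W = rng.normal(size=(N_RES, N_RES))
    W *= 0.9 / np.max(np.abs(np.linalg.eigvals(W)))   # echo-state scaling

    def reservoir_states(X):
        h = np.zeros(N_RES)
        states = []
        for x in X:
            h = np.tanh(W_in @ x + W @ h)
            states.append(h)
        return np.array(states)

    def local_readout(X, y, lam=1e-2):
        S = reservoir_states(X)
        Y = np.eye(N_CLS)[y]                          # one-hot labels
        return np.linalg.solve(S.T @ S + lam * np.eye(N_RES), S.T @ Y)

    # Each client fits only the cheap linear readout; the server averages.
    clients = [(rng.normal(size=(200, N_IN)), rng.integers(0, N_CLS, 200))
               for _ in range(5)]
    W_out = np.mean([local_readout(X, y) for X, y in clients], axis=0)
    print(W_out.shape)   # (100, 3)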

arXiv:2408.07325 (https://arxiv.org/abs/2408.07325) [eess.IV, cs.GR]
Title: RoCoSDF: Row-Column Scanned Neural Signed Distance Fields for Freehand 3D Ultrasound Imaging Shape Reconstruction
Authors: Hongbo Chen, Yuchong Gao, Shuhang Zhang, Jiangjie Wu, Yuexin Ma, Rui Zheng
Abstract: The reconstruction of high-quality shape geometry is crucial for developing freehand 3D ultrasound imaging. However, the shape reconstruction of multi-view ultrasound data remains challenging due to the elevation distortion caused by thick transducer probes. In this paper, we present a novel learning-based framework, RoCoSDF, which can effectively generate an implicit surface through continuous shape representations derived from row-column scanned datasets. In RoCoSDF, we encode the datasets from different views into corresponding neural signed distance functions (SDFs) and then operate on all SDFs in a normalized 3D space to restore the actual surface contour. Without requiring pre-training on large-scale ground-truth shapes, our approach can synthesize a smooth and continuous signed distance field from multi-view SDFs to implicitly represent the actual geometry. Furthermore, two regularizers are introduced to facilitate shape refinement by constraining the SDF near the surface. Experiments on twelve shapes acquired by two ultrasound transducer probes validate that RoCoSDF can effectively reconstruct accurate geometric shapes from multi-view ultrasound data, outperforming current reconstruction methods. Code is available at https://github.com/chenhbo/RoCoSDF.
Submitted 14 August, 2024; originally announced August 2024.
Comments: Accepted by MICCAI 2024
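
One way to picture fusing row- and column-scan SDFs in a shared normalized space is a boolean composition of the two fields; a toy sketch with analytic stand-ins for the learned neural SDFs (the max composition is an assumption, not necessarily RoCoSDF's operator):

    # Fusing two view-specific SDFs on a shared grid; plain arrays stand
    # in for the learned neural SDFs.
    import numpy as np

    xs = np.linspace(-1, 1, 64)
    X, Y, Z = np.meshgrid(xs, xs, xs, indexing="ij")

    # Stand-ins for the row- and column-scan SDFs of the same object:
    sdf_row = np.sqrt(X**2 + Y**2 + Z**2) - 0.5                 # sphere
    sdf_col = np.maximum.reduce([np.abs(X), np.abs(Y), np.abs(Z)]) - 0.45  # cube

    # Boolean intersection keeps the region both views agree on,
    # trimming each view's elevation-direction over-estimate.
    fused = np.maximum(sdf_row, sdf_col)
    surface = np.abs(fused) < 0.02        # thin shell near the zero level set
    print(surface.sum(), "voxels near the fused surface")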

arXiv:2407.19704 (https://arxiv.org/abs/2407.19704) [eess.IV, cs.MM, cs.SD, eess.AS]
Title: UNQA: Unified No-Reference Quality Assessment for Audio, Image, Video, and Audio-Visual Content
Authors: Yuqin Cao, Xiongkuo Min, Yixuan Gao, Wei Sun, Weisi Lin, Guangtao Zhai
Abstract: As multimedia data flourishes on the Internet, quality assessment (QA) of multimedia data becomes paramount for digital media applications. Since multimedia data includes multiple modalities, namely audio, image, video, and audio-visual (A/V) content, researchers have developed a range of QA methods to evaluate the quality of different modality data.
While these methods exclusively focus on single-modality QA issues, a unified QA model that can handle diverse media across multiple modalities is still missing, even though such a model would better resemble human perception behaviour and have a wider range of applications. In this paper, we propose the Unified No-reference Quality Assessment model (UNQA) for audio, image, video, and A/V content, which trains a single QA model across different media modalities. To tackle the issue of inconsistent quality scales among different QA databases, we develop a multi-modality strategy to jointly train UNQA on multiple QA databases. Based on the input modality, UNQA selectively extracts spatial, motion, and audio features, and computes a final quality score via the four corresponding modality regression modules. Compared with existing QA methods, UNQA has two advantages: 1) the multi-modality training strategy makes the QA model learn a more general and robust quality-aware feature representation, as evidenced by the superior performance of UNQA compared to state-of-the-art QA methods; 2) UNQA reduces the number of models required to assess multimedia data across different modalities and is friendly to deploy in practical applications.
Submitted 29 July, 2024; originally announced July 2024.

arXiv:2407.14220 (https://arxiv.org/abs/2407.14220) [cs.RO, eess.SY]
Title: Stochastic Model Predictive Control with Optimal Linear Feedback for Mobile Robots in Dynamic Environments
Authors: Yunfan Gao, Florian Messerer, Niels van Duijkeren, Moritz Diehl
Abstract: Robot navigation around humans can be a challenging problem since human movements are hard to predict. Stochastic model predictive control (MPC) can account for such uncertainties and approximately bound the probability of a collision taking place. In this paper, to counteract the rapidly growing human motion uncertainty over time, we incorporate state feedback in the stochastic MPC.
This allows the robot to more closely track reference trajectories. To this end, the feedback policy is left as a degree of freedom in the optimal control problem. The stochastic MPC with feedback is validated in simulation experiments and compared against nominal MPC and stochastic MPC without feedback. The added computation time can be limited by reducing the number of additional variables for the feedback law, with a small compromise in control performance.
Submitted 19 July, 2024; originally announced July 2024.
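
Why leaving a feedback gain in the prediction helps can be seen from covariance propagation: with u = u_nom + K(x - x_nom), uncertainty evolves through A + BK rather than A. A tiny numeric check (the dynamics, gain, and noise are made up; this is not the paper's optimal control problem):

    import numpy as np

    A = np.array([[1.0, 0.1], [0.0, 1.0]])   # double-integrator-like dynamics
    B = np.array([[0.005], [0.1]])
    W = 0.01 * np.eye(2)                      # process / human-motion noise
    K = np.array([[-10.0, -5.0]])             # a stabilizing feedback gain

    def propagate(P, Acl, steps=20):
        for _ in range(steps):
            P = Acl @ P @ Acl.T + W
        return P

    P0 = 0.01 * np.eye(2)
    open_loop = propagate(P0, A)              # feedback ignored in prediction
    closed_loop = propagate(P0, A + B @ K)    # feedback kept as a degree of freedom
    print(np.trace(open_loop), ">", np.trace(closed_loop))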

arXiv:2407.13210 (https://arxiv.org/abs/2407.13210) [eess.IV, cs.CV]
Title: Improved Esophageal Varices Assessment from Non-Contrast CT Scans
Authors: Chunli Li, Xiaoming Zhang, Yuan Gao, Xiaoli Yin, Le Lu, Ling Zhang, Ke Yan, Yu Shi
Abstract: Esophageal varices (EV), a serious health concern resulting from portal hypertension, are traditionally diagnosed through invasive endoscopic procedures. Although non-contrast computed tomography (NC-CT) is a cheaper and non-invasive imaging modality, it has yet to gain full acceptance as a primary clinical diagnostic tool for EV evaluation. To overcome existing diagnostic challenges, we present the Multi-Organ-cOhesion-Network (MOON), a novel framework enhancing the analysis of critical organ features in NC-CT scans for effective assessment of EV. Drawing inspiration from the thorough assessment practices of radiologists, MOON establishes a cohesive multi-organ analysis model that unifies the imaging features of the organs related to EV, namely the esophagus, liver, and spleen, which significantly increases the diagnostic accuracy for EV. We have compiled an extensive NC-CT dataset of 1,255 patients diagnosed with EV, spanning three grades of severity, with each case corroborated by endoscopic diagnostic results. The efficacy of MOON has been substantiated through multi-fold cross-validation on 1,010 cases and an independent test on 245 cases, exhibiting superior diagnostic performance compared to methods focusing solely on the esophagus (for classifying the severe grade: AUC of 0.864 versus 0.803; for moderate to severe grades: AUC of 0.832 versus 0.793). To our knowledge, MOON is the first work to incorporate synchronized multi-organ NC-CT analysis for EV assessment, providing a more acceptable and minimally invasive alternative for patients compared to traditional endoscopy.
Submitted 18 July, 2024; originally announced July 2024.
Comments: Early accepted to MICCAI 2024

arXiv:2407.12038 (https://arxiv.org/abs/2407.12038) [eess.AS, cs.AI]
Title: ICAGC 2024: Inspirational and Convincing Audio Generation Challenge 2024
Authors: Ruibo Fu, Rui Liu, Chunyu Qiang, Yingming Gao, Yi Lu, Shuchen Shi, Tao Wang, Ya Li, Zhengqi Wen, Chen Zhang, Hui Bu, Yukun Liu, Xin Qi, Guanjun Li
Abstract: The Inspirational and Convincing Audio Generation Challenge 2024 (ICAGC 2024) is part of the ISCSLP 2024 Competitions and Challenges track. While current text-to-speech (TTS) technology can generate high-quality audio, its ability to convey complex emotions and controlled detail content remains limited. This constraint leads to a discrepancy between the generated audio and human subjective perception in practical applications like companion robots for children and marketing bots. The core issue lies in the inconsistency between high-quality audio generation and the ultimate human subjective experience.
Therefore, this challenge aims to enhance the persuasiveness and acceptability of synthesized audio, focusing on human-aligned convincing and inspirational audio generation. A total of 19 teams registered for the challenge, and the results of the competition are described in this paper.
Submitted 31 July, 2024; v1 submitted 1 July, 2024; originally announced July 2024.
Comments: ISCSLP 2024 Challenge description and results

arXiv:2407.11510 (https://arxiv.org/abs/2407.11510) [eess.AS]
Title: VoxBlink2: A 100K+ Speaker Recognition Corpus and the Open-Set Speaker-Identification Benchmark
Authors: Yuke Lin, Ming Cheng, Fulin Zhang, Yingying Gao, Shilei Zhang, Ming Li
Abstract: In this paper, we provide a large audio-visual speaker recognition dataset, VoxBlink2, which includes approximately 10M utterances with videos from 110K+ speakers in the wild. This dataset represents a significant expansion over the VoxBlink dataset, encompassing a broader diversity of speakers and scenarios thanks to an optimized data collection pipeline.
Afterward, we explore the impact of training strategies, data scale, and model complexity on speaker verification, and finally establish a new single-model state-of-the-art EER of 0.170% and minDCF of 0.006% on the VoxCeleb1-O test set. Such remarkable results motivate us to explore speaker recognition from a new challenging perspective. We introduce the Open-Set Speaker-Identification task, which is designed to either match a probe utterance with a known gallery speaker or categorize it as an unknown query. Associated with this task, we design a concrete benchmark and evaluation protocols. The data and model resources can be found at http://voxblink2.github.io.
Submitted 16 July, 2024; originally announced July 2024.
Comments: Accepted by InterSpeech2024
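
The open-set decision the benchmark poses can be sketched in a few lines: match the probe embedding to the best gallery speaker, or reject it as unknown below a threshold (the embeddings, cosine scoring, and threshold value here are all illustrative):

    import numpy as np

    def l2norm(v):
        return v / np.linalg.norm(v, axis=-1, keepdims=True)

    rng = np.random.default_rng(1)
    gallery = l2norm(rng.normal(size=(1000, 256)))  # enrolled speaker embeddings
    probe = l2norm(rng.normal(size=256))

    scores = gallery @ probe                        # cosine similarity
    best = int(np.argmax(scores))
    THRESHOLD = 0.6                                 # hypothetical operating point
    if scores[best] >= THRESHOLD:
        print(f"identified as gallery speaker {best} ({scores[best]:.3f})")
    else:
        print(f"unknown speaker (best match only {scores[best]:.3f})")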

arXiv:2407.11080 (https://arxiv.org/abs/2407.11080) [eess.SP]
Title: Performance analysis for a rotary compressor at high speed: experimental study and mathematical modeling
Authors: Chuntai Zheng, Wei Zhao, Benshuai Lyu, Keke Gao, Hongjun Cao, Lei Zhong, Yi Gao, Ren Liao
Abstract: This paper presents a comprehensive study of the performance of a rotary compressor over a rotational speed range of 80 Hz to 200 Hz through experimental tests and mathematical modeling. A compressor performance test rig was designed to conduct the performance tests, with fast-response pressure sensors and displacement sensors capturing the P-V diagram and the dynamic motion of the moving components. Results show that compressor efficiency degrades at high speeds due to the dominant loss factors of leakage and discharge power loss. Supercharging effects become significant at speeds above 160 Hz, and their net effect reduces compressor efficiency, especially at high speeds. This study identifies and analyzes the loss factors affecting mass flow rate and power consumption based on experimental data, and hypothesizes possible mechanisms for each loss factor, which can aid the design of a high-speed rotary compressor with higher efficiency.
Submitted 13 July, 2024; originally announced July 2024.
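
A P-V diagram yields indicated work per revolution as the loop integral of P dV, and indicated power after multiplying by shaft speed; a sketch on synthetic traces (the waveforms and the 160 Hz operating point are made up):

    import numpy as np

    theta = np.linspace(0, 2 * np.pi, 721)       # one shaft revolution, closed loop
    V = 1e-5 * (1.05 + np.cos(theta)) / 2.05     # m^3, toy chamber volume
    P = 1e5 * (2.0 + np.sin(theta))              # Pa, toy pressure trace

    # Work input to the gas per revolution: W = -closed integral of P dV.
    W_cycle = -np.trapz(P, V)
    speed_hz = 160.0                             # hypothetical operating point
    print(f"indicated power ~ {W_cycle * speed_hz:.0f} W")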

arXiv:2407.05984 (https://arxiv.org/abs/2407.05984) [eess.IV]
Title: MBA-Net: SAM-driven Bidirectional Aggregation Network for Ovarian Tumor Segmentation
Authors: Yifan Gao, Wei Xia, Wenkui Wang, Xin Gao
Abstract: Accurate segmentation of ovarian tumors from medical images is crucial for early diagnosis, treatment planning, and patient management. However, the diverse morphological characteristics and heterogeneous appearances of ovarian tumors pose significant challenges to automated segmentation methods. In this paper, we propose MBA-Net, a novel architecture that integrates the powerful segmentation capabilities of the Segment Anything Model (SAM) with domain-specific knowledge for accurate and robust ovarian tumor segmentation. MBA-Net employs a hybrid encoder architecture, where the encoder consists of a prior branch, which inherits the SAM encoder to capture robust segmentation priors, and a domain branch, specifically designed to extract domain-specific features. The bidirectional flow of information between the two branches is facilitated by the robust feature injection network (RFIN) and the domain knowledge integration network (DKIN), enabling MBA-Net to leverage the complementary strengths of both branches. We extensively evaluate MBA-Net on the public multi-modality ovarian tumor ultrasound dataset and an in-house multi-site ovarian tumor MRI dataset. Our proposed method consistently outperforms state-of-the-art segmentation approaches, and demonstrates superior generalization capability across different imaging modalities and clinical sites.
Submitted 8 July, 2024; originally announced July 2024.
Comments: MICCAI 2024
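
The hybrid-encoder idea, a frozen prior branch alongside a trainable domain branch, can be sketched as below; the 1x1-conv fusion is a placeholder for the paper's RFIN/DKIN exchange, and all layer sizes are invented.

    import torch
    import torch.nn as nn

    def conv_block(cin, cout):
        return nn.Sequential(nn.Conv2d(cin, cout, 3, padding=1), nn.ReLU())

    class TwoBranchEncoder(nn.Module):
        def __init__(self):
            super().__init__()
            self.prior = conv_block(1, 32)      # stand-in for the SAM encoder
            for p in self.prior.parameters():   # priors inherited, not retrained
                p.requires_grad = False
            self.domain = conv_block(1, 32)     # learns domain-specific features
            self.fuse = nn.Conv2d(64, 32, 1)    # placeholder for the exchange

        def forward(self, x):
            return self.fuse(torch.cat([self.prior(x), self.domain(x)], dim=1))

    enc = TwoBranchEncoder()
    print(enc(torch.randn(1, 1, 128, 128)).shape)  # torch.Size([1, 32, 128, 128])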

arXiv:2407.05928 (https://arxiv.org/abs/2407.05928) [eess.SP]
Title: CA-FedRC: Codebook Adaptation via Federated Reservoir Computing in 5G NR
Authors: Ziqiang Ye, Sikai Liao, Yulan Gao, Shu Fang, Yue Xiao, Ming Xiao, Saviour Zammit
Abstract: With the burgeoning deployment of fifth-generation new radio (5G NR) networks, the codebook plays a crucial role in enabling the base station (BS) to acquire channel state information (CSI). Different 5G NR codebooks incur varying overheads and exhibit performance disparities under diverse channel conditions, necessitating codebook adaptation based on channel conditions to reduce feedback overhead while enhancing performance. However, existing methods of 5G NR codebook adaptation either require significant overhead for model training and feedback or fall short in performance. To address these limitations, this letter introduces a federated reservoir computing framework designed for efficient codebook adaptation in computationally and feedback resource-constrained mobile devices. This framework utilizes a novel series of indicators as input training data, striking an effective balance between performance and feedback overhead. Compared to conventional models, the proposed codebook adaptation via federated reservoir computing (CA-FedRC) achieves rapid convergence and significant loss reduction in both speed and accuracy.
Extensive simulations under various channel conditions demonstrate that our algorithm not only reduces users' resource consumption but also accurately identifies channel types, thereby optimizing the trade-off between spectrum efficiency, computational complexity, and feedback overhead.
Submitted 8 July, 2024; originally announced July 2024.

arXiv:2407.05571 (https://arxiv.org/abs/2407.05571) [cs.NI, eess.SP]
Title: Cost-Efficient Computation Offloading in SAGIN: A Deep Reinforcement Learning and Perception-Aided Approach
Authors: Yulan Gao, Ziqiang Ye, Han Yu
Abstract: The Space-Air-Ground Integrated Network (SAGIN), crucial to the advancement of sixth-generation (6G) technology, plays a key role in ensuring universal connectivity, particularly by addressing the communication needs of remote areas lacking cellular network infrastructure. This paper delves into the role of unmanned aerial vehicles (UAVs) within SAGIN, where they act as a control layer owing to their adaptable deployment capabilities and their intermediary role. Equipped with millimeter-wave (mmWave) radar and vision sensors, these UAVs are capable of acquiring multi-source data, which helps to diminish uncertainty and enhance the accuracy of decision-making. Concurrently, UAVs collect computing tasks from mobile devices moving at different speeds within their coverage areas. These tasks are then allocated to ground base stations (BSs), low-earth-orbit (LEO) satellites, and local processing units to improve processing efficiency. Within this framework, our study concentrates on devising dynamic strategies for facilitating task hosting between mobile devices and UAVs, offloading computations, managing associations between UAVs and BSs, and allocating computing resources. The objective is to minimize the time-averaged network cost, considering the uncertainty of device locations, speeds, and even types. To tackle these complexities, we propose a deep reinforcement learning and perception-aided online approach (DRL-and-Perception-aided Approach) for this joint optimization in SAGIN, tailored to an environment filled with uncertainties. The effectiveness of the proposed approach is validated through extensive numerical simulations, which quantify its performance relative to various network parameters.
Submitted 7 July, 2024; originally announced July 2024.

arXiv:2407.02911 (https://arxiv.org/abs/2407.02911) [eess.IV, cs.CV]
Title: Non-Adversarial Learning: Vector-Quantized Common Latent Space for Multi-Sequence MRI
Authors: Luyi Han, Tao Tan, Tianyu Zhang, Xin Wang, Yuan Gao, Chunyao Lu, Xinglong Liang, Haoran Dou, Yunzhi Huang, Ritse Mann
Abstract: Adversarial learning helps generative models translate MRI from source to target sequence when paired samples are lacking. However, implementing MRI synthesis with adversarial learning in clinical settings is challenging due to training instability and mode collapse.
To address this issue, we leverage intermediate sequences to estimate the common latent space among multi-sequence MRI, enabling the reconstruction of distinct sequences from the common latent space. We propose a generative model that compresses discrete representations of each sequence to estimate the Gaussian distribution of a vector-quantized common (VQC) latent space between multiple sequences. Moreover, we improve latent space consistency with contrastive learning and increase model stability by domain augmentation. Experiments on the BraTS2021 dataset show that our non-adversarial model outperforms other GAN-based methods, and the VQC latent space helps our model achieve (1) anti-interference ability, which can eliminate the effects of noise, bias fields, and artifacts, and (2) solid semantic representation ability, with the potential for one-shot segmentation. Our code is publicly available.
Submitted 3 July, 2024; originally announced July 2024.
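
The vector-quantization step a VQC latent space relies on snaps each latent vector to its nearest codebook entry, with a straight-through gradient; a generic sketch follows (codebook size and dimensions are arbitrary, and this is not the authors' model):

    import torch

    def vector_quantize(z, codebook):
        # z: (N, D) latents; codebook: (K, D) learned code vectors.
        dists = torch.cdist(z, codebook)      # (N, K) pairwise distances
        idx = dists.argmin(dim=1)
        z_q = codebook[idx]
        # Straight-through estimator so gradients flow to the encoder:
        return z + (z_q - z).detach(), idx

    codebook = torch.randn(512, 64)
    z = torch.randn(10, 64)
    z_q, idx = vector_quantize(z, codebook)
    print(z_q.shape, idx[:5])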
arXiv:2407.00949 [pdf, ps, other] (https://arxiv.org/abs/2407.00949)
Subjects: cs.CV (Computer Vision and Pattern Recognition); eess.IV (Image and Video Processing)
Title: SpectralKAN: Kolmogorov-Arnold Network for Hyperspectral Images Change Detection
Authors: Yanheng Wang, Xiaohan Yu, Yongsheng Gao, Jianjun Sha, Jian Wang, Lianru Gao, Yonggang Zhang, Xianhui Rong
Abstract: It has been verified that deep learning methods, including convolutional neural networks (CNNs), graph neural networks (GNNs), and transformers, can accurately extract features from hyperspectral images (HSIs). These algorithms perform exceptionally well on HSI change detection (HSIs-CD). However, the downside of these impressive results is the enormous number of parameters, FLOPs, GPU memory, and training and test time required. In this paper, we propose a spectral Kolmogorov-Arnold Network for HSIs-CD (SpectralKAN). SpectralKAN represents a multivariate continuous function as a composition of activation functions in order to extract HSI features and classify them. These activation functions are B-spline functions with different parameters that can simulate various functions. In SpectralKAN, a KAN encoder is proposed to enhance computational efficiency for HSIs, and a spatial-spectral KAN encoder is introduced: the spatial KAN encoder extracts spatial features and compresses the spatial dimension from patch size to one, while the spectral KAN encoder extracts spectral features and classifies them into changed and unchanged categories. We use five HSIs-CD datasets to verify the effectiveness of SpectralKAN. Experiments show that SpectralKAN maintains high HSIs-CD accuracy while requiring fewer parameters, FLOPs, GPU memory, and training and testing time, thereby increasing the efficiency of HSIs-CD. The code will be available at https://github.com/yanhengwang-heu/SpectralKAN.
Submitted 1 July, 2024; originally announced July 2024.
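As rough intuition for KAN-style layers (a learnable one-dimensional function on every input-output edge), here is a toy sketch that substitutes fixed Gaussian bumps for the B-spline bases; it is a deliberate simplification for illustration, not SpectralKAN itself:

```python
import torch
import torch.nn as nn

class TinyKANLayer(nn.Module):
    """Toy KAN-style layer: each edge (i -> o) applies its own learnable
    1-D function, built as a linear mix of fixed Gaussian bumps standing
    in for B-splines."""
    def __init__(self, d_in, d_out, grid=8):
        super().__init__()
        self.register_buffer("centers", torch.linspace(-1.0, 1.0, grid))
        self.coef = nn.Parameter(0.1 * torch.randn(d_out, d_in, grid))

    def forward(self, x):  # x: (B, d_in), roughly scaled to [-1, 1]
        # evaluate every basis function at every coordinate: (B, d_in, grid)
        basis = torch.exp(-((x.unsqueeze(-1) - self.centers) * 4.0) ** 2)
        # output_o = sum_i phi_{o,i}(x_i): contract inputs and bases
        return torch.einsum("big,oig->bo", basis, self.coef)
```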
arXiv:2406.18067 [pdf, other] (https://arxiv.org/abs/2406.18067)
Subjects: cs.CL (Computation and Language); eess.AS (Audio and Speech Processing)
Title: Exploring Energy-Based Models for Out-of-Distribution Detection in Dialect Identification
Authors: Yaqian Hao, Chenguang Hu, Yingying Gao, Shilei Zhang, Junlan Feng
Abstract: The diverse nature of dialects presents challenges for models trained on specific linguistic patterns, rendering them susceptible to errors when confronted with unseen or out-of-distribution (OOD) data. This study introduces a novel margin-enhanced joint energy model (MEJEM) tailored specifically for OOD detection in dialects. By integrating a generative model and the energy margin loss, our approach aims to enhance the robustness of dialect identification systems. Furthermore, we explore two OOD scores for OOD dialect detection, and our findings conclusively demonstrate that the energy score outperforms the softmax score. Leveraging Sharpness-Aware Minimization to optimize the training of the joint model, we enhance model generalization by minimizing both loss and sharpness. Experiments conducted on dialect identification tasks validate the efficacy of Energy-Based Models and provide valuable insights into their performance.
Submitted 26 June, 2024; originally announced June 2024.
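The two OOD scores compared in the abstract have standard definitions; a generic sketch from classifier logits (the temperature T and sign convention below are the usual ones, not taken from the paper):

```python
import torch
import torch.nn.functional as F

def ood_scores(logits, T=1.0):
    """Softmax vs. energy OOD scores from logits of shape (N, C).
    Under both conventions, higher means more in-distribution."""
    softmax_score = F.softmax(logits / T, dim=-1).max(dim=-1).values
    # energy E(x) = -T * logsumexp(logits / T); the score is -E(x)
    energy_score = T * torch.logsumexp(logits / T, dim=-1)
    return softmax_score, energy_score
```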
arXiv:2406.18065 [pdf, other] (https://arxiv.org/abs/2406.18065)
Subjects: eess.AS (Audio and Speech Processing); cs.SD (Sound)
Title: On Calibration of Speech Classification Models: Insights from Energy-Based Model Investigations
Authors: Yaqian Hao, Chenguang Hu, Yingying Gao, Shilei Zhang, Junlan Feng
Abstract: For speech classification tasks, deep learning models often achieve high accuracy but exhibit shortcomings in calibration, manifesting as classifier overconfidence. Calibration matters because it plays a critical role in guaranteeing the reliability of decision-making within deep learning systems. This study explores the effectiveness of Energy-Based Models (EBMs) in calibrating confidence for speech classification tasks by training a joint EBM that integrates a discriminative and a generative model, thereby enhancing the classifier's calibration and mitigating overconfidence. Experimental evaluations were conducted on three speech classification tasks: age, emotion, and language recognition. Our findings highlight the competitive performance of EBMs in calibrating speech classification models. This research emphasizes the potential of EBMs in speech classification tasks, demonstrating their ability to enhance calibration without sacrificing accuracy.
Submitted 26 June, 2024; originally announced June 2024.
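Calibration in this setting is usually quantified by the expected calibration error (ECE); a sketch of the standard binned estimator (a common metric definition, not code from the paper):

```python
import numpy as np

def expected_calibration_error(conf, correct, n_bins=10):
    """Binned ECE: weighted average gap between mean confidence and accuracy.
    conf: (N,) max-softmax confidences; correct: (N,) 0/1 prediction hits."""
    edges = np.linspace(0.0, 1.0, n_bins + 1)
    ece = 0.0
    for lo, hi in zip(edges[:-1], edges[1:]):
        in_bin = (conf > lo) & (conf <= hi)
        if in_bin.any():
            ece += in_bin.mean() * abs(correct[in_bin].mean() - conf[in_bin].mean())
    return ece
```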
arXiv:2406.14264 [pdf, other] (https://arxiv.org/abs/2406.14264)
Subjects: eess.IV (Image and Video Processing); cs.CV (Computer Vision and Pattern Recognition)
DOI: 10.1109/TCI.2024.3458411 (https://doi.org/10.1109/TCI.2024.3458411)
Title: Zero-Shot Image Denoising for High-Resolution Electron Microscopy
Authors: Xuanyu Tian, Zhuoya Dong, Xiyue Lin, Yue Gao, Hongjiang Wei, Yanhang Ma, Jingyi Yu, Yuyao Zhang
Abstract: The high-resolution electron microscopy (HREM) imaging technique is a powerful tool for directly visualizing a broad range of materials in real space. However, it faces challenges in denoising due to the ultra-low signal-to-noise ratio (SNR) and scarce data availability. In this work, we propose Noise2SR, a zero-shot self-supervised learning (ZS-SSL) denoising framework for HREM. Within our framework, we propose a super-resolution (SR) based self-supervised training strategy incorporating a Random Sub-sampler module. The Random Sub-sampler is designed to generate approximately infinite noisy pairs from a single noisy image, serving as an effective data augmentation for zero-shot denoising. Noise2SR trains the network with paired noisy images of different resolutions, obtained via the SR strategy. The SR-based training lets the network use more pixels for supervision, and the random sub-sampling compels the network to learn continuous signals, enhancing robustness. Meanwhile, we mitigate the uncertainty caused by random sampling by adopting minimum mean squared error (MMSE) estimation for the denoised results. With this distinctive integration of training strategy and proposed designs, Noise2SR can achieve superior denoising performance using a single noisy HREM image. We evaluate the performance of Noise2SR on both simulated and real HREM denoising tasks. It outperforms state-of-the-art ZS-SSL methods and achieves denoising performance comparable to supervised methods. The success of Noise2SR suggests its potential for improving the SNR of images in material imaging domains.
Submitted 19 November, 2024; v1 submitted 20 June, 2024; originally announced June 2024.
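The pairing idea can be sketched in a few lines: two strided sub-samples of the same noisy image yield a lower-resolution noisy pair for self-supervision. This is a simplified stand-in for the paper's Random Sub-sampler, whose exact sampling scheme may differ:

```python
import torch

def random_subsample(noisy, s=2):
    """Keep every s-th pixel starting from a random offset, yielding a
    (C, H//s, W//s) noisy view of the same underlying signal."""
    dy, dx = torch.randint(0, s, (2,)).tolist()
    return noisy[:, dy::s, dx::s]

# Two independent draws form a lower-resolution noisy pair for training;
# if both draws happen to pick the same offsets, resample one of them.
# noisy = ...  # a single (C, H, W) HREM image
# pair = random_subsample(noisy), random_subsample(noisy)
```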
Comments: 12 pages, 12 figures
Journal ref: IEEE Transactions on Computational Imaging 10 (2024), 1462–1475

arXiv:2406.13268 [pdf, other] (https://arxiv.org/abs/2406.13268)
Subjects: eess.AS (Audio and Speech Processing); cs.SD (Sound)
Title: CEC: A Noisy Label Detection Method for Speaker Recognition
Authors: Yao Shen, Yingying Gao, Yaqian Hao, Chenguang Hu, Fulin Zhang, Junlan Feng, Shilei Zhang
Abstract: Noisy labels are inevitable, even in well-annotated datasets. The detection of noisy labels is of significant importance to enhance the robustness of speaker recognition models. In this paper, we propose a novel noisy label detection approach based on two new statistical metrics: Continuous Inconsistent Counting (CIC) and Total Inconsistent Counting (TIC). These metrics are calculated through Cross-Epoch Counting (CEC) and correspond to the early and late stages of training, respectively. Additionally, we categorize samples based on their prediction results into three categories: inconsistent samples, hard samples, and easy samples. During training, we gradually increase the difficulty of hard samples to update model parameters, preventing noisy labels from being overfitted. Compared to contrastive schemes, our approach not only achieves the best performance in speaker verification but also excels in noisy label detection.
Submitted 19 June, 2024; originally announced June 2024.
Comments: Interspeech 2024
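One plausible reading of cross-epoch counting is to track, per sample, how often the model's prediction disagrees with its assigned label over training epochs. The sketch below counts the total number of inconsistent epochs and the longest consecutive inconsistent run; these definitions are assumed for illustration, and the paper's exact CIC/TIC metrics may differ:

```python
import numpy as np

def inconsistency_counts(pred_history, labels):
    """Per-sample cross-epoch disagreement counts.
    pred_history: (E, N) predicted classes over E epochs; labels: (N,).
    Returns the longest consecutive run (CIC-like) and total count (TIC-like)."""
    disagree = pred_history != labels[None, :]   # (E, N) booleans
    tic = disagree.sum(axis=0)                   # total inconsistent epochs
    run = np.zeros(labels.shape, dtype=int)
    cic = np.zeros(labels.shape, dtype=int)
    for epoch_flags in disagree:                 # scan epochs in order
        run = np.where(epoch_flags, run + 1, 0)  # extend or reset the run
        cic = np.maximum(cic, run)
    return cic, tic
```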
arXiv:2406.13145 [pdf, other] (https://arxiv.org/abs/2406.13145)
Subjects: eess.SY (Systems and Control); cs.LG (Machine Learning)
Title: Constructing and Evaluating Digital Twins: An Intelligent Framework for DT Development
Authors: Longfei Ma, Nan Cheng, Xiucheng Wang, Jiong Chen, Yinjun Gao, Dongxiao Zhang, Jun-Jie Zhang
Abstract: The development of Digital Twins (DTs) represents a transformative advance for simulating and optimizing complex systems in a controlled digital space. Despite their potential, the challenge of constructing DTs that accurately replicate and predict the dynamics of real-world systems remains substantial. This paper introduces an intelligent framework for the construction and evaluation of DTs, specifically designed to enhance the accuracy and utility of DTs in testing algorithmic performance. We propose a novel construction methodology that integrates deep learning-based policy gradient techniques to dynamically tune DT parameters, ensuring high fidelity in the digital replication of physical systems. Moreover, the Mean STate Error (MSTE) is proposed as a robust metric for evaluating the performance of algorithms within these digital spaces. The efficacy of our framework is demonstrated through extensive simulations showing that our DT not only accurately mirrors physical reality but also provides a reliable platform for algorithm evaluation. This work lays a foundation for future research into DT technologies, highlighting pathways for both theoretical enhancements and practical implementations in various industries.
Submitted 18 June, 2024; originally announced June 2024.
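A natural way to read a mean state error is as the average distance between the physical system's trajectory and the twin's trajectory; a one-function sketch under that assumption (the paper's MSTE definition is not reproduced here):

```python
import numpy as np

def mean_state_error(real_states, twin_states):
    """Average Euclidean distance between matched state vectors over a
    trajectory. Both arrays: (T, D) states sampled at the same T steps."""
    return float(np.linalg.norm(real_states - twin_states, axis=1).mean())
```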
arXiv:2406.12300 [pdf] (https://arxiv.org/abs/2406.12300)
Subjects: eess.IV (Image and Video Processing); cs.CV (Computer Vision and Pattern Recognition); q-bio.NC (Neurons and Cognition)
Title: IR2QSM: Quantitative Susceptibility Mapping via Deep Neural Networks with Iterative Reverse Concatenations and Recurrent Modules
Authors: Min Li, Chen Chen, Zhuang Xiong, Ying Liu, Pengfei Rong, Shanshan Shan, Feng Liu, Hongfu Sun, Yang Gao
Abstract: Quantitative susceptibility mapping (QSM) is an MRI phase-based post-processing technique for extracting the distribution of tissue susceptibilities, with significant potential for studying neurological diseases. However, the ill-conditioned nature of dipole inversion makes QSM reconstruction from the tissue field prone to noise and artifacts. In this work, we propose IR2QSM, a novel deep learning-based method for QSM reconstruction. It iterates four times over a U-net enhanced with reverse concatenations and middle recurrent modules, which dramatically improves the utilization of latent features. Simulated and in vivo experiments were conducted to compare IR2QSM with several traditional algorithms (MEDI and iLSQR) and state-of-the-art deep learning methods (U-net, xQSM, and LPCNN). The results indicate that IR2QSM obtains QSM images with significantly increased accuracy and mitigated artifacts relative to the other methods. In particular, IR2QSM demonstrated on average the best NRMSE (27.59%) in simulated experiments, which is 15.48%, 7.86%, 17.24%, 9.26%, and 29.13% lower than iLSQR, MEDI, U-net, xQSM, and LPCNN, respectively, and it led to improved QSM results with fewer artifacts on the in vivo data.
Submitted 18 June, 2024; originally announced June 2024.
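NRMSE as quoted above is conventionally the error norm relative to the reference norm, expressed in percent; a standard sketch of that convention (not the authors' code):

```python
import numpy as np

def nrmse_percent(recon, reference):
    """Normalized RMSE in percent: ||recon - reference|| / ||reference|| * 100,
    computed over the whole volume."""
    return 100.0 * np.linalg.norm(recon - reference) / np.linalg.norm(reference)
```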
Comments: 10 pages, 9 figures

arXiv:2406.10856 [pdf, other] (https://arxiv.org/abs/2406.10856)
Subjects: cs.NI (Networking and Internet Architecture); eess.SY (Systems and Control)
Title: LEO Satellite Networks Assisted Geo-distributed Data Processing
Authors: Zhiyuan Zhao, Zhe Chen, Zheng Lin, Wenjun Zhu, Kun Qiu, Chaoqun You, Yue Gao
Abstract: Nowadays, the increasing deployment of edge clouds globally provides users with low-latency services. However, connecting an edge cloud to a core cloud via optic cables in terrestrial networks poses significant barriers due to the prohibitively expensive cost of building optic cables. Fortunately, emerging Low Earth Orbit (LEO) satellite networks (e.g., Starlink) offer a more cost-effective solution for the growing number of edge clouds, so large volumes of data in edge clouds can be transferred to a core cloud via those networks for time-sensitive big-data processing tasks, such as attack detection. However, our measurements show that state-of-the-art satellite selection algorithms perform poorly for such processing. Therefore, we propose a novel data-volume-aware satellite selection algorithm, named DVA, to support such big-data processing tasks. DVA takes into account both the data size in edge clouds and satellite capacity to finalize the selection, thereby preventing congestion in the access network and reducing transmission duration. Extensive simulations validate that DVA achieves a significantly lower average access-network duration than state-of-the-art satellite selection algorithms on a LEO satellite emulation platform.
Submitted 16 June, 2024; originally announced June 2024.
Comments: 6 pages, 5 figures
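In spirit, data-volume-aware selection filters satellites by spare capacity and then minimizes estimated transfer time; a greedy sketch in which the dict fields and the scoring rule are illustrative assumptions, not the DVA algorithm itself:

```python
def select_satellite(visible_sats, data_volume_gb):
    """Pick a satellite whose spare capacity covers the edge cloud's data
    and whose estimated transfer time is smallest.
    visible_sats: list of dicts with assumed keys 'spare_capacity_gb'
    and 'uplink_gbps'."""
    feasible = [s for s in visible_sats
                if s["spare_capacity_gb"] >= data_volume_gb]
    if not feasible:
        return None  # e.g., split the data or wait for the next pass
    # transfer time ~ bits / uplink rate (8 bits per byte)
    return min(feasible,
               key=lambda s: 8.0 * data_volume_gb / s["uplink_gbps"])
```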
arXiv:2406.09931 [pdf, other] (https://arxiv.org/abs/2406.09931)
Subjects: eess.IV (Image and Video Processing); cs.CV (Computer Vision and Pattern Recognition); cs.LG (Machine Learning)
Title: SCKansformer: Fine-Grained Classification of Bone Marrow Cells via Kansformer Backbone and Hierarchical Attention Mechanisms
Authors: Yifei Chen, Zhu Zhu, Shenghao Zhu, Linwei Qiu, Binfeng Zou, Fan Jia, Yunpeng Zhu, Chenyan Zhang, Zhaojie Fang, Feiwei Qin, Jin Fan, Changmiao Wang, Yu Gao, Gang Yu
Abstract: The incidence and mortality rates of malignant tumors, such as acute leukemia, have risen significantly. Clinically, hospitals rely on cytological examination of peripheral blood and bone marrow smears to diagnose malignant tumors, with accurate blood cell counting being crucial. Existing automated methods face challenges such as low feature expression capability, poor interpretability, and redundant feature extraction when processing high-dimensional microimage data. We propose a novel fine-grained classification model, SCKansformer, for bone marrow blood cells, which addresses these challenges and enhances classification accuracy and efficiency. The model integrates the Kansformer Encoder, SCConv Encoder, and Global-Local Attention Encoder. The Kansformer Encoder replaces the traditional MLP layer with a KAN, improving nonlinear feature representation and interpretability. The SCConv Encoder, with its Spatial and Channel Reconstruction Units, enhances feature representation and reduces redundancy. The Global-Local Attention Encoder combines Multi-head Self-Attention with a Local Part module to capture both global and local features. We validated our model using the Bone Marrow Blood Cell Fine-Grained Classification Dataset (BMCD-FGCD), comprising over 10,000 samples in nearly 40 classes, developed with a partner hospital. Comparative experiments on our private dataset, as well as on the publicly available PBC and ALL-IDB datasets, demonstrate that SCKansformer outperforms both typical and advanced microcell classification methods across all datasets. Our source code and private BMCD-FGCD dataset are available at https://github.com/JustlfC03/SCKansformer.
Submitted 11 October, 2024; v1 submitted 14 June, 2024; originally announced June 2024.
Comments: 14 pages, 6 figures
Journal ref: IEEE Journal of Biomedical and Health Informatics, 2024

arXiv:2406.09444 [pdf, other] (https://arxiv.org/abs/2406.09444)
Subjects: eess.AS (Audio and Speech Processing); cs.CL (Computation and Language); cs.SD (Sound)
Title: GenDistiller: Distilling Pre-trained Language Models based on an Autoregressive Generative Model
Authors: Yingying Gao, Shilei Zhang, Chao Deng, Junlan Feng
Abstract: Pre-trained speech language models such as HuBERT and WavLM leverage unlabeled speech data for self-supervised learning and offer powerful representations for numerous downstream tasks. Despite the success of these models, their high memory and compute requirements hinder their application on resource-restricted devices. Therefore, this paper introduces GenDistiller, a novel knowledge distillation framework in which a much smaller student network directly generates the hidden representations of the pre-trained teacher model. The proposed method takes the previous hidden layer as history and predicts the teacher model layer by layer, autoregressively. Experiments on SUPERB reveal the advantage of GenDistiller over the baseline distillation method without an autoregressive framework, with 33% fewer parameters, similar time consumption, and better performance on most SUPERB tasks. Ultimately, the proposed GenDistiller reduces the size of WavLM by 82%.
Submitted 21 June, 2024; v1 submitted 11 June, 2024; originally announced June 2024.
Comments: arXiv admin note: text overlap with arXiv:2310.13418
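The autoregressive, layer-by-layer idea can be sketched as a loss in which the student predicts teacher layer l conditioned on its own prediction of layer l-1. The `student_step` callable and the L1 objective below are assumptions for illustration, not the released implementation:

```python
import torch
import torch.nn.functional as F

def autoregressive_distill_loss(student_step, teacher_layers):
    """Layer-by-layer generative distillation sketch.
    student_step: callable mapping one (B, T, D) hidden state to the next.
    teacher_layers: list of (B, T, D) hidden states from the frozen teacher."""
    loss = 0.0
    prev = torch.zeros_like(teacher_layers[0])  # empty history to start
    for target in teacher_layers:
        pred = student_step(prev)               # generate the next layer
        loss = loss + F.l1_loss(pred, target)
        prev = pred.detach()                    # feed prediction back as history
    return loss / len(teacher_layers)
```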
arXiv:2406.07801 [pdf, other] (https://arxiv.org/abs/2406.07801)
Subjects: cs.CL (Computation and Language); cs.SD (Sound); eess.AS (Audio and Speech Processing)
Title: PolySpeech: Exploring Unified Multitask Speech Models for Competitiveness with Single-task Models
Authors: Runyan Yang, Huibao Yang, Xiqing Zhang, Tiantian Ye, Ying Liu, Yingying Gao, Shilei Zhang, Chao Deng, Junlan Feng
Abstract: Recently, there have been attempts to integrate various speech processing tasks into a unified model. However, few previous works have directly demonstrated that joint optimization of diverse tasks in multitask speech models positively influences the performance of individual tasks. In this paper, we present PolySpeech, a multitask speech model that supports speech recognition, speech synthesis, and two speech classification tasks. PolySpeech takes a multi-modal language model as its core structure and uses semantic representations as speech inputs. We introduce semantic speech embedding tokenization and speech reconstruction methods to PolySpeech, enabling efficient generation of high-quality speech for any given speaker. PolySpeech shows competitiveness across various tasks compared to single-task models. In our experiments, multitask optimization achieves performance comparable to single-task optimization and is especially beneficial for specific tasks.
Submitted 11 June, 2024; originally announced June 2024.
Comments: 5 pages, 2 figures