Search | arXiv e-print repository
Showing 1–50 of 1,294 results for author: Chen, Y
Searching in archive eess. Results sorted by announcement date (newest first), 50 per page.
1. arXiv:2502.12355 [pdf, other]  cs.RO cs.LG eess.SY
   Hovering Flight of Soft-Actuated Insect-Scale Micro Aerial Vehicles using Deep Reinforcement Learning
   Authors: Yi-Hsuan Hsiao, Wei-Tung Chen, Yun-Sheng Chang, Pulkit Agrawal, YuFeng Chen
   Abstract: Soft-actuated insect-scale micro aerial vehicles (IMAVs) pose unique challenges for designing robust and computationally efficient controllers. At the millimeter scale, fast robot dynamics ($\sim$ms), together with system delay, model uncertainty, and external disturbances, significantly affect flight performance. Here, we design a deep reinforcement learning (RL) controller that addresses system delay and uncertainties. To initialize this neural network (NN) controller, we propose a modified behavior cloning (BC) approach with state-action re-matching to account for delay, and domain-randomized expert demonstrations to tackle uncertainty. We then apply proximal policy optimization (PPO) to fine-tune the policy during RL, enhancing performance and smoothing commands. In simulations, our modified BC substantially increases the mean reward compared to baseline BC, and RL with PPO improves flight quality and reduces command fluctuations. We deploy this controller on two different insect-scale aerial robots that weigh 720 mg and 850 mg, respectively. The robots demonstrate multiple successful zero-shot hovering flights, with the longest lasting 50 seconds and root-mean-square errors of 1.34 cm in the lateral direction and 0.05 cm in altitude, marking the first end-to-end deep RL-based flight on soft-driven IMAVs.
   Submitted 17 February, 2025; originally announced February 2025.
   Comments: 7 pages, 7 figures; accepted to the 2025 IEEE International Conference on Soft Robotics (RoboSoft).
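A minimal sketch of the state-action re-matching idea this abstract describes, assuming the system delay is an integer number of control steps and that re-matching means pairing each observed state with the expert command actually driving the system at that instant; the names and the delay convention are illustrative, not from the paper:

```python
# Behavior-cloning data preparation with state-action re-matching, under
# the assumption that the delay is `delay_steps` control ticks. All names
# here are hypothetical stand-ins, not the paper's implementation.
import numpy as np

def rematch_state_actions(states: np.ndarray, actions: np.ndarray,
                          delay_steps: int):
    """Pair the state at time t with the command issued at t - delay_steps,
    i.e. the action whose effect the state actually reflects."""
    return states[delay_steps:], actions[:len(actions) - delay_steps]

# Example: 1000-step expert rollout, 12-dim state, 4-dim actuator command
states = np.random.randn(1000, 12)
actions = np.random.randn(1000, 4)
s, a = rematch_state_actions(states, actions, delay_steps=3)
assert len(s) == len(a) == 997
# BC then regresses policy(s) -> a, e.g. with an MSE loss, before PPO fine-tuning.
```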
2. arXiv:2502.10932 [pdf, other]  eess.SY
   PPAC Driven Multi-die and Multi-technology Floorplanning
   Authors: Cristhian Roman-Vicharra, Yiran Chen, Jiang Hu
   Abstract: In heterogeneous integration, where different dies may utilize distinct technologies, floorplanning across multiple dies inherently requires simultaneous technology selection. This work presents the first systematic study of multi-die and multi-technology floorplanning. Unlike many conventional approaches, which are driven primarily by area and wirelength, this study additionally considers performance, power, and cost, highlighting the impact of technology selection. A simulated annealing method and a reinforcement learning technique are developed. Experimental results show that the proposed techniques significantly outperform a naive baseline approach.
   Submitted 15 February, 2025; originally announced February 2025.
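This abstract names simulated annealing over a joint die-assignment and technology-selection space. A generic annealing skeleton with a weighted performance/power/area/cost (PPAC-style) objective is sketched below; the move set, cost terms, and weights are stand-ins, not the paper's formulation:

```python
# Generic simulated annealing for multi-die, multi-technology assignment,
# sketched from the abstract alone. Cost model and weights are placeholders.
import math, random

TECHS = ["7nm", "16nm", "28nm"]           # candidate technologies (assumed)
BLOCKS = list(range(20))                   # block ids
N_DIES = 4

def cost(assign):
    """Weighted PPAC-style objective; each term is a toy stand-in."""
    perf = sum(TECHS.index(t) for _, t in assign.values())       # finer tech, better perf
    power = sum(3 - TECHS.index(t) for _, t in assign.values())
    area = len({d for d, _ in assign.values()})                  # dies actually used
    dollars = sum(10.0 / (1 + TECHS.index(t)) for _, t in assign.values())
    return 1.0 * perf + 0.5 * power + 2.0 * area + 0.1 * dollars

def neighbor(assign):
    new = dict(assign)
    b = random.choice(BLOCKS)
    new[b] = (random.randrange(N_DIES), random.choice(TECHS))    # perturb one block
    return new

assign = {b: (random.randrange(N_DIES), random.choice(TECHS)) for b in BLOCKS}
T = 10.0
while T > 1e-3:
    cand = neighbor(assign)
    dE = cost(cand) - cost(assign)
    if dE < 0 or random.random() < math.exp(-dE / T):
        assign = cand                                            # accept move
    T *= 0.995                                                   # cool down
```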
3. arXiv:2502.10580 [pdf, other]  eess.IV physics.med-ph
   Accelerating Quantitative MRI using Subspace Multiscale Energy Model (SS-MuSE)
   Authors: Yan Chen, Jyothi Rikhab Chand, Steven R. Kecskemeti, James H. Holmes, Mathews Jacob
   Abstract: Multi-contrast MRI methods acquire multiple images with different contrast weightings, which are used to differentiate tissue types or for quantitative mapping. However, the scan time needed to acquire multiple contrasts is prohibitively long for 3D acquisition schemes, which can offer isotropic image resolution. While deep learning-based methods have been used extensively to accelerate 2D and 2D+time problems, their high memory demand, computation time, and need for large training datasets make them challenging for large-scale volumes. To address these challenges, we generalize the plug-and-play multi-scale energy-based model (MuSE) to a regularized subspace recovery setting, where we jointly regularize the 3D multi-contrast spatial factors in a subspace formulation. The explicit energy-based formulation allows us to use variable-splitting optimization methods for computationally efficient recovery.
   Submitted 14 February, 2025; originally announced February 2025.
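The basic subspace factorization this abstract builds on can be sketched in a few lines: the multi-contrast series is modeled as spatial factors times a low-rank contrast basis, and the spatial factors are recovered by regularized least squares. The MuSE energy prior and the variable-splitting solver are omitted; the dimensions and the random basis below are placeholders:

```python
# Bare-bones subspace model: X (voxels x contrasts) ~ U @ V with a
# low-dimensional contrast basis V. In practice V comes from the SVD of a
# dictionary of simulated signal evolutions; a random orthonormal basis
# stands in here.
import numpy as np

n_vox, n_contrasts, rank = 4096, 16, 4

V = np.linalg.qr(np.random.randn(n_contrasts, n_contrasts))[0][:rank]  # (rank, n_contrasts)

U_true = np.random.randn(n_vox, rank)                # spatial factors
X = U_true @ V + 0.01 * np.random.randn(n_vox, n_contrasts)  # noisy toy data

# Regularized least-squares recovery of the spatial factors:
# U = X V^T (V V^T + lam I)^(-1)
lam = 1e-3
U_hat = X @ V.T @ np.linalg.inv(V @ V.T + lam * np.eye(rank))
print(np.linalg.norm(U_hat - U_true) / np.linalg.norm(U_true))  # small relative error
```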
4. arXiv:2502.10465 [pdf, other]  eess.IV cs.CR
   Image Watermarking of Generative Diffusion Models
   Authors: Yunzhuo Chen, Jordan Vice, Naveed Akhtar, Nur Al Hasan Haldar, Ajmal Mian
   Abstract: Embedding watermarks into the output of generative models is essential for establishing copyright and verifiable ownership over the generated content. Emerging diffusion model watermarking methods either embed watermarks in the frequency domain or offer limited versatility of the watermark patterns in image space, which allows simplistic detection and removal of the watermarks from the generated content. To address this issue, we propose a watermarking technique that embeds watermark features into the diffusion model itself. Our technique enables training of a paired watermark extractor for a generative model through an end-to-end process. During training, the extractor forces the generator to embed versatile, imperceptible watermarks in the generated content while simultaneously ensuring their precise recovery. We demonstrate highly accurate watermark embedding and detection, and show that it is also possible to distinguish between different watermarks embedded with our method, thereby differentiating between generative models.
   Submitted 12 February, 2025; originally announced February 2025.
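A hedged sketch of the end-to-end objective the abstract implies: a quality term keeps the generated image close to a reference while a paired extractor must recover the watermark bits. The toy networks, bit length, and weighting below are assumptions, not the authors' architecture:

```python
# Two-term watermark training objective, sketched from the abstract:
# image quality + bit recovery. Both networks are illustrative stubs.
import torch
import torch.nn as nn

extractor = nn.Sequential(                 # toy watermark extractor
    nn.Conv2d(3, 16, 3, padding=1), nn.ReLU(),
    nn.AdaptiveAvgPool2d(1), nn.Flatten(), nn.Linear(16, 48))  # 48 bits assumed

def watermark_loss(generated, reference, wm_bits, alpha=0.1):
    """Quality term plus bit-recovery term (binary cross-entropy)."""
    quality = nn.functional.mse_loss(generated, reference)
    logits = extractor(generated)
    recovery = nn.functional.binary_cross_entropy_with_logits(logits, wm_bits)
    return quality + alpha * recovery

imgs = torch.rand(2, 3, 64, 64)            # stand-ins for diffusion outputs
bits = torch.randint(0, 2, (2, 48)).float()
loss = watermark_loss(imgs + 0.01 * torch.randn_like(imgs), imgs, bits)
loss.backward()                            # gradients flow into the extractor
```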
5. arXiv:2502.09779 [pdf, other]  eess.IV cs.CV
   Automated Muscle and Fat Segmentation in Computed Tomography for Comprehensive Body Composition Analysis
   Authors: Yaqian Chen, Hanxue Gu, Yuwen Chen, Jicheng Yang, Haoyu Dong, Joseph Y. Cao, Adrian Camarena, Christopher Mantyh, Roy Colglazier, Maciej A. Mazurowski
   Abstract: Body composition assessment using CT images can potentially be used for a number of clinical applications, including the prognostication of cardiovascular outcomes, evaluation of metabolic health, monitoring of disease progression, assessment of nutritional status, prediction of treatment response in oncology, and risk stratification for surgical and critical care outcomes.
   While multiple groups have developed in-house segmentation tools for this analysis, very few publicly available tools can be used consistently across different applications. To mitigate this gap, we present a publicly accessible, end-to-end segmentation and feature-calculation model specifically for CT body composition analysis. Our model segments skeletal muscle, subcutaneous adipose tissue (SAT), and visceral adipose tissue (VAT) across the chest, abdomen, and pelvis in axial CT images. It also provides various body composition metrics, including muscle density, visceral-to-subcutaneous fat (VAT/SAT) ratio, muscle area/volume, and skeletal muscle index (SMI), supporting both 2D and 3D assessments. The model is shared for public use. To evaluate the model, the segmentation was applied to both internal and external datasets, with body composition metrics analyzed across different age, sex, and race groups. The model achieved high Dice coefficients on both internal and external datasets, exceeding 89% for skeletal muscle, SAT, and VAT segmentation, and it outperforms the benchmark by 2.40% on skeletal muscle and 10.26% on SAT relative to the manual annotations of the publicly available dataset. Body composition metrics show mean relative absolute errors (MRAEs) under 10% for all measures. Furthermore, the model provides muscular fat segmentation with a Dice coefficient of 56.27%, which can be utilized for additional analyses as needed.
   Submitted 13 February, 2025; originally announced February 2025.
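The metrics this abstract lists (muscle density, VAT/SAT ratio, area, SMI) are straightforward to compute once a slice is segmented. A sketch follows; the label ids, pixel spacing, and the SMI convention (muscle area normalized by height squared) are assumptions, not taken from the released model:

```python
# Body-composition metrics from one labeled axial CT slice.
import numpy as np

MUSCLE, SAT, VAT = 1, 2, 3                     # hypothetical label ids

def composition_metrics(mask, ct_hu, spacing_mm, height_m):
    px_cm2 = (spacing_mm[0] * spacing_mm[1]) / 100.0   # mm^2 per pixel -> cm^2
    muscle_area = (mask == MUSCLE).sum() * px_cm2
    sat_area = (mask == SAT).sum() * px_cm2
    vat_area = (mask == VAT).sum() * px_cm2
    return {
        "muscle_area_cm2": muscle_area,
        "muscle_density_hu": float(ct_hu[mask == MUSCLE].mean()),  # mean HU in muscle
        "vat_sat_ratio": vat_area / max(sat_area, 1e-6),
        "smi_cm2_per_m2": muscle_area / height_m**2,               # assumed convention
    }

mask = np.random.randint(0, 4, (512, 512))     # toy segmentation
hu = np.random.normal(30, 20, (512, 512))      # toy HU values
print(composition_metrics(mask, hu, spacing_mm=(0.8, 0.8), height_m=1.72))
```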
6. arXiv:2502.07844 [pdf, other]  eess.IV cs.CV
   The establishment of static digital humans and the integration with spinal models
   Authors: Fujiao Ju, Yuxuan Wang, Shuo Wang, Chengyin Wang, Yinbo Chen, Jianfeng Li, Mingjie Dong, Bin Fang, Qianyu Zhuang
   Abstract: Adolescent idiopathic scoliosis (AIS), a prevalent spinal deformity, significantly affects individuals' health and quality of life. Conventional imaging techniques, such as X-rays, computed tomography (CT), and magnetic resonance imaging (MRI), offer static views of the spine but are limited in capturing its dynamic changes and its interactions with overall body motion. Developing new techniques to address these limitations has therefore become extremely important. Dynamic digital human modeling represents a major breakthrough in digital medicine: it enables a three-dimensional (3D) view of the spine as it changes during daily activities, assisting clinicians in detecting deformities that might be missed in static imaging. Although dynamic modeling holds great potential, constructing an accurate static digital human model is a crucial first step for high-precision simulations. In this study, we focus on constructing an accurate static digital human model integrating the spine, which is vital for subsequent dynamic digital human research on AIS. First, we generate human point-cloud data by combining the 3D Gaussian method with the Skinned Multi-Person Linear (SMPL) model from the patient's multi-view images. Then, we fit a standard skeletal model to the generated human model. Next, we align the real spine model reconstructed from CT images with the standard skeletal model. We validated the resulting personalized spine model using X-ray data from six AIS patients, with Cobb angles (used to measure the severity of scoliosis) as the evaluation metric. The results indicate that the model's error was within 1 degree of the actual measurements. This study presents an important method for constructing digital humans.
   Submitted 11 February, 2025; originally announced February 2025.
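Since Cobb angles are the evaluation metric here, a common way to compute one from per-vertebra endplate tilts is worth showing; extracting the tilts from the fitted spine model is assumed to happen upstream, and the numbers below are hypothetical:

```python
# Cobb angle as the maximum pairwise difference of coronal endplate tilts,
# a common operationalization of the clinical definition.
import numpy as np

def cobb_angle(tilts_deg):
    """Max difference between endplate tilts over all vertebra pairs."""
    t = np.asarray(tilts_deg, dtype=float)
    return float(t.max() - t.min())

# Tilts for a run of thoracic/lumbar vertebrae (hypothetical patient)
tilts = [2.0, 4.5, 8.0, 12.5, 15.0, 11.0, 6.0, 1.0, -4.0, -9.5, -12.0]
print(f"Cobb angle: {cobb_angle(tilts):.1f} degrees")  # 15.0 - (-12.0) = 27.0
```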
7. arXiv:2502.07467 [pdf, other]  eess.SP eess.SY
   Integrated Sensing, Communication, and Over-The-Air Control of UAV Swarm Dynamics
   Authors: Zhuangkun Wei, Wenxiu Hu, Yathreb Bouazizi, Mengbang Zou, Chenguang Liu, Yunfei Chen, Hongjian Sun, Julie McCann
   Abstract: Coordinated control of a large UAV swarm requires significant spectrum resources because bandwidth must be allocated per UAV, posing a challenge in resource-limited environments. Over-the-air (OTA) control has emerged as a spectrum-efficient approach, leveraging electromagnetic superposition to form control signals at a base station (BS). However, existing OTA controllers lack sufficient optimization variables to meet UAV swarm control objectives and fail to integrate control with other BS functions such as sensing. This work proposes an integrated sensing and OTA control framework (ISAC-OTA) for UAV swarms. The BS performs OTA signal construction (uplink) and dispatch (downlink) while simultaneously sensing objects. Two uplink post-processing methods are developed: a control-centric approach that generates closed-form control signals via a feedback-looped OTA control problem, and a sensing-centric method that mitigates transmission-induced interference for accurate object sensing. For the downlink, a non-convex problem is formulated and solved to minimize control signal dispatch (transmission) error while maintaining a minimum sensing signal-to-noise ratio (SNR). Simulation results show that the proposed ISAC-OTA controller achieves control performance comparable to the benchmark optimal control algorithm while maintaining high sensing accuracy, despite OTA transmission interference. Moreover, it eliminates the need for per-UAV bandwidth allocation, showcasing a spectrum-efficient method for cooperative control in future wireless systems.
   Submitted 11 February, 2025; originally announced February 2025.
8. arXiv:2502.04128 [pdf, other]  eess.AS cs.AI cs.CL cs.MM cs.SD
   Llasa: Scaling Train-Time and Inference-Time Compute for Llama-based Speech Synthesis
   Authors: Zhen Ye, Xinfa Zhu, Chi-Min Chan, Xinsheng Wang, Xu Tan, Jiahe Lei, Yi Peng, Haohe Liu, Yizhu Jin, Zheqi DAI, Hongzhan Lin, Jianyi Chen, Xingjian Du, Liumeng Xue, Yunlin Chen, Zhifei Li, Lei Xie, Qiuqiang Kong, Yike Guo, Wei Xue
   Abstract: Recent advances in text-based large language models (LLMs), particularly in the GPT series and the o1 model, have demonstrated the effectiveness of scaling both training-time and inference-time compute.
   However, current state-of-the-art TTS systems leveraging LLMs are often multi-stage, requiring separate models (e.g., diffusion models after the LLM), complicating the decision of whether to scale a particular model during training or testing. This work makes the following contributions: First, we explore the scaling of train-time and inference-time compute for speech synthesis. Second, we propose Llasa, a simple framework for speech synthesis that employs a single-layer vector quantizer (VQ) codec and a single Transformer architecture to fully align with standard LLMs such as Llama. Our experiments reveal that scaling train-time compute for Llasa consistently improves the naturalness of synthesized speech and enables the generation of more complex and accurate prosody patterns. Furthermore, from the perspective of scaling inference-time compute, we employ speech understanding models as verifiers during search, finding that scaling inference-time compute shifts the sampling modes toward the preferences of specific verifiers, thereby improving emotional expressiveness, timbre consistency, and content accuracy. In addition, we publicly release the checkpoints and training code for our TTS models (1B, 3B, 8B) and codec model.
   Submitted 6 February, 2025; originally announced February 2025.
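The verifier-guided inference-time scaling described above reduces, in its simplest form, to best-of-N sampling: draw several candidates and keep the one the verifier prefers. The sketch below uses hypothetical stand-ins (tts_sample, verifier_score), not Llasa's actual API:

```python
# Best-of-N sampling with a verifier, a minimal form of the inference-time
# compute scaling the abstract describes. All functions are placeholders.
import random

def tts_sample(text: str, seed: int) -> str:
    return f"<waveform for '{text}' (seed={seed})>"   # placeholder audio

def verifier_score(text: str, audio: str) -> float:
    # e.g. a speech-understanding model scoring content accuracy,
    # emotion, and timbre consistency; random stand-in here
    return random.random()

def best_of_n(text: str, n: int = 8) -> str:
    candidates = [tts_sample(text, seed) for seed in range(n)]
    return max(candidates, key=lambda a: verifier_score(text, a))

print(best_of_n("Scaling inference-time compute shifts sampling modes."))
```

Raising n spends more inference-time compute and biases the output toward whatever the chosen verifier rewards, which matches the trade-off the abstract reports.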
9. arXiv:2502.03781 [pdf, ps, other]  cs.CV eess.IV
   Gaze-Assisted Human-Centric Domain Adaptation for Cardiac Ultrasound Image Segmentation
   Authors: Ruiyi Li, Yuting He, Rongjun Ge, Chong Wang, Daoqiang Zhang, Yang Chen, Shuo Li
   Abstract: Domain adaptation (DA) for cardiac ultrasound image segmentation is clinically significant and valuable. However, previous domain adaptation methods are prone to being affected by incomplete pseudo-labels and low-quality target-to-source images. Human-centric domain adaptation has the great advantage of human cognitive guidance, which helps the model adapt to the target domain and reduces reliance on labels. Doctors' gaze trajectories contain a large amount of cross-domain human guidance. To leverage gaze information and human cognition for guiding domain adaptation, we propose gaze-assisted human-centric domain adaptation (GAHCDA), which reliably guides the domain adaptation of cardiac ultrasound images. GAHCDA includes the following modules: (1) Gaze Augment Alignment (GAA), which enables the model to obtain general features of human cognition to recognize segmentation targets across domains of cardiac ultrasound images, as humans do; and (2) Gaze Balance Loss (GBL), which fuses the gaze heatmap with model outputs, making the segmentation result structurally closer to the target domain. The experimental results illustrate that our proposed framework segments cardiac ultrasound images in the target domain more effectively than GAN-based and other self-training-based methods, showing great potential for clinical application.
   Submitted 6 February, 2025; originally announced February 2025.
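The abstract leaves the exact GBL fusion unspecified; one plausible reading, stated purely as an assumption, is a per-pixel segmentation loss weighted by a normalized gaze heatmap, so regions the clinician fixated on dominate adaptation:

```python
# Speculative sketch of a gaze-weighted segmentation loss; the paper's
# actual Gaze Balance Loss may fuse the heatmap differently.
import torch
import torch.nn.functional as F

def gaze_balance_loss(logits, pseudo_labels, gaze_heatmap, eps=1e-6):
    """logits, pseudo_labels, gaze_heatmap: (B, 1, H, W);
    gaze_heatmap is a nonnegative fixation-density map."""
    w = gaze_heatmap / (gaze_heatmap.amax(dim=(2, 3), keepdim=True) + eps)
    per_px = F.binary_cross_entropy_with_logits(
        logits, pseudo_labels, reduction="none")
    return (w * per_px).mean()            # fixated pixels weigh more

logits = torch.randn(2, 1, 128, 128, requires_grad=True)
labels = (torch.rand(2, 1, 128, 128) > 0.5).float()
gaze = torch.rand(2, 1, 128, 128)
gaze_balance_loss(logits, labels, gaze).backward()
```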
10. arXiv:2502.02071 [pdf]  eess.SY
    Sequential Multi-objective Multi-agent Reinforcement Learning Approach for Predictive Maintenance
    Authors: Yan Chen, Cheng Liu
    Abstract: Existing predictive maintenance (PdM) methods typically focus solely on whether to replace system components, without considering the costs incurred by inspection. However, a well-considered approach should minimize the Remaining Useful Life (RUL) at engine replacement while maximizing the inspection interval. To achieve this, multi-agent reinforcement learning (MARL) can be introduced. However, due to the sequential and mutually constraining nature of these two objectives, conventional MARL is not applicable. This paper therefore introduces a novel framework and develops a Sequential Multi-objective Multi-agent Proximal Policy Optimization (SMOMA-PPO) algorithm. Furthermore, to provide comprehensive and effective degradation information to the RL agents, we also employ a Gated Recurrent Unit, quantile regression, and probability distribution fitting to develop a GRU-based RUL Prediction (GRP) model. Experiments demonstrate that the GRP method significantly improves the accuracy of RUL predictions in the later stages of system operation compared to existing methods. When incorporating its output into SMOMA-PPO, we achieve at least a 15% reduction in average RUL without unscheduled replacements (UR), nearly a 10% increase in inspection interval, and an overall decrease in maintenance costs. Importantly, our approach offers a new perspective for addressing multi-objective maintenance planning with sequential constraints, effectively enhancing system reliability and reducing maintenance expenses.
    Submitted 4 February, 2025; originally announced February 2025.
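The GRP model pairs a GRU with quantile regression; the standard pinball loss that quantile regression minimizes is sketched below (the network and the quantile choices are illustrative, not the paper's):

```python
# Pinball (quantile) loss: penalizes under-prediction by q and
# over-prediction by (1 - q), so the model learns the q-th conditional
# quantile of the RUL distribution.
import torch

def pinball_loss(pred, target, q: float):
    err = target - pred
    return torch.mean(torch.maximum(q * err, (q - 1) * err))

rul_true = torch.tensor([120.0, 80.0, 30.0])   # toy ground-truth RUL (cycles)
rul_pred = torch.tensor([110.0, 95.0, 25.0])
for q in (0.1, 0.5, 0.9):                      # lower band / median / upper band
    print(q, pinball_loss(rul_pred, rul_true, q).item())
```

Training one head per quantile yields the predictive bands that a distribution can then be fitted to, which is consistent with the probability-distribution-fitting step the abstract mentions.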
11. arXiv:2502.01770 [pdf, other]  cs.LG cs.AI eess.IV
    Hamming Attention Distillation: Binarizing Keys and Queries for Efficient Long-Context Transformers
    Authors: Mark Horton, Tergel Molom-Ochir, Peter Liu, Bhavna Gopal, Chiyue Wei, Cong Guo, Brady Taylor, Deliang Fan, Shan X. Wang, Hai Li, Yiran Chen
    Abstract: Pre-trained transformer models with extended context windows are notoriously expensive to run at scale, often limiting real-world deployment due to their high computational and memory requirements. In this paper, we introduce Hamming Attention Distillation (HAD), a novel framework that binarizes keys and queries in the attention mechanism to achieve significant efficiency gains. By converting keys and queries into {-1, +1} vectors and replacing dot-product operations with efficient Hamming distance computations, our method drastically reduces computational overhead. Additionally, we incorporate attention matrix sparsification to prune low-impact activations, which further reduces the cost of processing long-context sequences. Despite these aggressive compression strategies, our distilled approach preserves a high degree of representational power, leading to substantially improved accuracy compared to prior transformer binarization methods. We evaluate HAD on a range of tasks and models, including the GLUE benchmark, ImageNet, and QuALITY, demonstrating state-of-the-art performance among binarized Transformers while drastically reducing the computational costs of long-context inference. We implement HAD in custom hardware simulations, demonstrating superior performance characteristics compared to a custom hardware implementation of standard attention. HAD achieves just $\mathbf{1.78}\%$ performance loss on GLUE compared to $9.08\%$ in state-of-the-art binarization work, and $\mathbf{2.5}\%$ performance loss on ImageNet compared to $12.14\%$, all while targeting custom hardware with a $\mathbf{79}\%$ area reduction and $\mathbf{87}\%$ power reduction compared to its standard attention counterpart.
    Submitted 3 February, 2025; originally announced February 2025.
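The core identity behind HAD is easy to verify: for d-dimensional {-1, +1} vectors, dot(q, k) = d - 2 * Hamming(q, k), so binarized attention scores can be computed with cheap Hamming-distance hardware. A NumPy check of that equivalence (the distillation procedure and sparsification are omitted):

```python
# Binarize queries and keys, compute Hamming distances, and recover the
# exact dot-product attention scores from them.
import numpy as np

def binarize(x):
    return np.where(x >= 0, 1, -1).astype(np.int8)

rng = np.random.default_rng(0)
d, n = 64, 8
Q = binarize(rng.standard_normal((n, d)))
K = binarize(rng.standard_normal((n, d)))

hamming = (Q[:, None, :] != K[None, :, :]).sum(-1)   # what cheap hardware computes
scores = d - 2 * hamming                             # recovered "dot products"
assert np.array_equal(scores, Q.astype(int) @ K.astype(int).T)

attn = np.exp(scores / np.sqrt(d))
attn /= attn.sum(-1, keepdims=True)                  # softmax over keys
```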
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.00700">arXiv:2502.00700</a> <span> [<a href="https://arxiv.org/pdf/2502.00700">pdf</a>, <a href="https://arxiv.org/format/2502.00700">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> S2CFormer: Reorienting Learned Image Compression from Spatial Interaction to Channel Aggregation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Chen%2C+Y">Yunuo Chen</a>, <a href="/search/eess?searchtype=author&query=Li%2C+Q">Qian Li</a>, <a href="/search/eess?searchtype=author&query=He%2C+B">Bing He</a>, <a href="/search/eess?searchtype=author&query=Feng%2C+D">Donghui Feng</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+R">Ronghua Wu</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+Q">Qi Wang</a>, <a href="/search/eess?searchtype=author&query=Song%2C+L">Li Song</a>, <a href="/search/eess?searchtype=author&query=Lu%2C+G">Guo Lu</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+W">Wenjun Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Transformers have achieved significant success in learned image compression (LIC), with Swin Transformers emerging as the mainstream choice for nonlinear transforms. A common belief is that their sophisticated spatial operations contribute most to their efficacy. However, the crucial role of the feed-forward network (FFN) based Channel Aggregation module within the transformer architecture has been largely overlooked, and the over-design of spatial operations leads to a suboptimal trade-off between decoding latency and R-D performance. In this paper, we reevaluate the key factors behind the competence of transformers in LIC. By replacing spatial operations with identity mapping, we are surprised to find that channel operations alone can approach the R-D performance of the leading methods. This solid lower bound of performance emphasizes that the presence of channel aggregation is more essential for the LIC model to achieve competitive performance, while the previously complex spatial interactions are partly redundant. Based on this insight, we initiate the "S2CFormer" paradigm, a general architecture that reorients the focus of LIC from Spatial Interaction to Channel Aggregation. We present two instantiations of the S2CFormer: S2C-Conv and S2C-Attention. Each one incorporates a simple operator for spatial interaction and serves as a nonlinear transform block for our LIC models. Both models demonstrate state-of-the-art (SOTA) R-D performance and significantly faster decoding speed. These results also motivate further exploration of advanced FFN structures to enhance the R-D performance while maintaining model efficiency. With these foundations, we introduce S2C-Hybrid, an enhanced LIC model that combines the strengths of different S2CFormer instantiations. This model outperforms all the existing methods on several datasets, setting a new benchmark for efficient and high-performance LIC. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 2 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.18736">arXiv:2501.18736</a> <span> [<a href="https://arxiv.org/pdf/2501.18736">pdf</a>, <a href="https://arxiv.org/format/2501.18736">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Distillation-Driven Diffusion Model for Multi-Scale MRI Super-Resolution: Make 1.5T MRI Great Again </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Wang%2C+Z">Zhe Wang</a>, <a href="/search/eess?searchtype=author&query=Ru%2C+Y">Yuhua Ru</a>, <a href="/search/eess?searchtype=author&query=Bauer%2C+F">Fabian Bauer</a>, <a href="/search/eess?searchtype=author&query=Chetouani%2C+A">Aladine Chetouani</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+F">Fang Chen</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+L">Liping Zhang</a>, <a href="/search/eess?searchtype=author&query=Hans%2C+D">Didier Hans</a>, <a href="/search/eess?searchtype=author&query=Jennane%2C+R">Rachid Jennane</a>, <a href="/search/eess?searchtype=author&query=Jarraya%2C+M">Mohamed Jarraya</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+Y+H">Yung Hsin Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Magnetic Resonance Imaging (MRI) offers critical insights into microstructural details; however, the spatial resolution of standard 1.5T imaging systems is often limited. In contrast, 7T MRI provides significantly enhanced spatial resolution, enabling finer visualization of anatomical structures. Despite this, the high cost and limited availability of 7T MRI hinder its widespread use in clinical settings. To address this challenge, a novel Super-Resolution (SR) model is proposed to generate 7T-like MRI from standard 1.5T MRI scans. Our approach leverages a diffusion-based architecture, incorporating gradient nonlinearity correction and bias field correction data from 7T imaging as guidance. Moreover, to improve deployability, a progressive distillation strategy is introduced. Specifically, the student model refines the 7T SR task in steps, leveraging feature maps from the inference phase of the teacher model as guidance, allowing the student model to progressively reach 7T SR performance with a smaller, deployable model size. Experimental results demonstrate that our baseline teacher model achieves state-of-the-art SR performance. The student model, while lightweight, sacrifices minimal performance. Furthermore, the student model is capable of accepting MRI inputs at varying resolutions without the need for retraining, further enhancing deployment flexibility. The clinical relevance of our proposed method is validated using clinical data from Massachusetts General Hospital. Our code is available at https://github.com/ZWang78/SR. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p>
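<p>As a rough sketch of the progressive distillation step described above, the snippet below trains a smaller student against the inference-time outputs of a frozen teacher. The tiny convolutional networks and all hyperparameters are hypothetical stand-ins; the paper's actual teacher and student are diffusion models.</p> <pre><code>import torch
import torch.nn as nn

# Hypothetical stand-ins; the paper's teacher/student are diffusion U-Nets.
teacher = nn.Sequential(nn.Conv2d(1, 32, 3, padding=1), nn.ReLU(),
                        nn.Conv2d(32, 1, 3, padding=1))
student = nn.Sequential(nn.Conv2d(1, 8, 3, padding=1), nn.ReLU(),
                        nn.Conv2d(8, 1, 3, padding=1))
teacher.eval()

opt = torch.optim.Adam(student.parameters(), lr=1e-4)
mse = nn.MSELoss()

x = torch.randn(4, 1, 64, 64)          # batch of 1.5T-like inputs
with torch.no_grad():
    guidance = teacher(x)              # teacher's inference output as guidance

opt.zero_grad()
loss = mse(student(x), guidance)       # student matches the teacher's maps
loss.backward()
opt.step()
</code></pre>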
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.18203">arXiv:2501.18203</a> <span> [<a href="https://arxiv.org/pdf/2501.18203">pdf</a>, <a href="https://arxiv.org/format/2501.18203">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Optimization and Control">math.OC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Joint Design and Pricing of Extended Warranties for Multiple Automobiles with Different Price Bands </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Chen%2C+Y">Yajing Chen</a>, <a href="/search/eess?searchtype=author&query=Li%2C+Y">Yanrong Li</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+X">Xiao-Lin Wang</a>, <a href="/search/eess?searchtype=author&query=Ye%2C+Z">Zhi-Sheng Ye</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Extended warranties (EWs) are a significant source of revenue for capital-intensive products like automobiles. Such products consist of multiple subsystems, providing flexibility in EW customization, for example, bundling a tailored set of subsystems in an EW contract. This, in turn, enables the creation of a service menu with different EW contract options. From the perspective of a third-party EW provider servicing a fleet of automobile brands, we develop a novel model to jointly optimize the design and pricing of EWs in order to maximize the profit. Specifically, the problem is to determine which contracts should be included in the EW menu and identify the appropriate price for each contract. As the complexity of the joint optimization problem increases exponentially with the number of subsystems, two solution approaches are devised to solve the problem. The first approach is based on a mixed-integer second-order cone programming reformulation, which guarantees optimality but is applicable only for a small number of subsystems. The second approach utilizes a two-step iteration process, offering enhanced computational efficiency in scenarios with a large number of subsystems. Through numerical experiments, the effectiveness of our model is validated, particularly in scenarios characterized by high failure rates and a large number of subsystems. </span> </p>
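<p>To see why the joint design-and-pricing problem grows exponentially with the number of subsystems, consider a toy brute-force search over all $2^n - 1$ candidate bundles. Everything here (the willingness-to-pay model <code>buy_prob</code>, the costs, and the price grid) is invented for illustration; the paper instead uses an MISOCP reformulation and a two-step iteration.</p> <pre><code>from itertools import combinations

subsystems = ["engine", "gearbox", "electronics", "chassis"]
repair_cost = {"engine": 900.0, "gearbox": 600.0,
               "electronics": 300.0, "chassis": 200.0}

def buy_prob(price, value):
    # Toy willingness-to-pay: purchase probability decays linearly in price.
    return max(0.0, 1.0 - price / (2.0 * value))

best = None
for r in range(1, len(subsystems) + 1):
    for bundle in combinations(subsystems, r):   # 2^n - 1 candidate contracts
        value = sum(repair_cost[s] for s in bundle)
        for price in range(100, 3000, 50):       # crude price grid
            profit = buy_prob(price, value) * (price - 0.5 * value)
            if best is None or profit > best[0]:
                best = (profit, bundle, price)

print(best)  # (expected profit, bundle, price) of the best single contract
</code></pre>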
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.17059">arXiv:2501.17059</a> <span> [<a href="https://arxiv.org/pdf/2501.17059">pdf</a>, <a href="https://arxiv.org/ps/2501.17059">ps</a>, <a href="https://arxiv.org/format/2501.17059">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Channel Estimation for XL-MIMO Systems with Decentralized Baseband Processing: Integrating Local Reconstruction with Global Refinement </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Tang%2C+A">Anzheng Tang</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+J">Jun-Bo Wang</a>, <a href="/search/eess?searchtype=author&query=Pan%2C+Y">Yijin Pan</a>, <a href="/search/eess?searchtype=author&query=Zeng%2C+C">Cheng Zeng</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+Y">Yijian Chen</a>, <a href="/search/eess?searchtype=author&query=Yu%2C+H">Hongkang Yu</a>, <a href="/search/eess?searchtype=author&query=Xiao%2C+M">Ming Xiao</a>, <a href="/search/eess?searchtype=author&query=de+Lamare%2C+R+C">Rodrigo C. de Lamare</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+J">Jiangzhou Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> In this paper, we investigate the channel estimation problem for extremely large-scale multiple-input multiple-output (XL-MIMO) systems with a hybrid analog-digital architecture, implemented within a decentralized baseband processing (DBP) framework with a star topology. Existing centralized and fully decentralized channel estimation methods face limitations due to excessive computational complexity or degraded performance. To overcome these challenges, we propose a novel two-stage channel estimation scheme that integrates local sparse reconstruction with global fusion and refinement.
Specifically, in the first stage, by exploiting the sparsity of channels in the angular-delay domain, the local reconstruction task is formulated as a sparse signal recovery problem. To solve it, we develop a graph neural networks-enhanced sparse Bayesian learning (SBL-GNNs) algorithm, which effectively captures dependencies among channel coefficients, significantly improving estimation accuracy. In the second stage, the local estimates from the local processing units (LPUs) are aligned into a global angular domain for fusion at the central processing unit (CPU). Based on the aggregated observations, the channel refinement is modeled as a Bayesian denoising problem. To efficiently solve it, we devise a variational message passing algorithm that incorporates a Markov chain-based hierarchical sparse prior, effectively leveraging both the sparsity and the correlations of the channels in the global angular-delay domain. Simulation results validate the effectiveness and superiority of the proposed SBL-GNNs algorithm over existing methods, demonstrating improved estimation performance and reduced computational complexity. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 28 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This manuscript has been submitted to IEEE journal for possible publication</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.16368">arXiv:2501.16368</a> <span> [<a href="https://arxiv.org/pdf/2501.16368">pdf</a>, <a href="https://arxiv.org/format/2501.16368">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Foundation Models for CPS-IoT: Opportunities and Challenges </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Baris%2C+O">Ozan Baris</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+Y">Yizhuo Chen</a>, <a href="/search/eess?searchtype=author&query=Dong%2C+G">Gaofeng Dong</a>, <a href="/search/eess?searchtype=author&query=Han%2C+L">Liying Han</a>, <a href="/search/eess?searchtype=author&query=Kimura%2C+T">Tomoyoshi Kimura</a>, <a href="/search/eess?searchtype=author&query=Quan%2C+P">Pengrui Quan</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+R">Ruijie Wang</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+T">Tianchen Wang</a>, <a href="/search/eess?searchtype=author&query=Abdelzaher%2C+T">Tarek Abdelzaher</a>, <a href="/search/eess?searchtype=author&query=Berg%C3%A9s%2C+M">Mario Bergés</a>,
<a href="/search/eess?searchtype=author&query=Liang%2C+P+P">Paul Pu Liang</a>, <a href="/search/eess?searchtype=author&query=Srivastava%2C+M">Mani Srivastava</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Methods from machine learning (ML) have transformed the implementation of Perception-Cognition-Communication-Action loops in Cyber-Physical Systems (CPS) and the Internet of Things (IoT), replacing mechanistic and basic statistical models with those derived from data. However, the first generation of ML approaches, which depend on supervised learning with annotated data to create task-specific models, faces significant limitations in scaling to the diverse sensor modalities, deployment configurations, application tasks, and operating dynamics characterizing real-world CPS-IoT systems. The success of task-agnostic foundation models (FMs), including multimodal large language models (LLMs), in addressing similar challenges across natural language, computer vision, and human speech has generated considerable enthusiasm for and exploration of FMs and LLMs as flexible building blocks in CPS-IoT analytics pipelines, promising to reduce the need for costly task-specific engineering. Nonetheless, a significant gap persists between the current capabilities of FMs and LLMs in the CPS-IoT domain and the requirements they must meet to be viable for CPS-IoT applications. In this paper, we analyze and characterize this gap through a thorough examination of the state of the art and our research, which extends beyond it in various dimensions. Based on the results of our analysis and research, we identify essential desiderata that CPS-IoT domain-specific FMs and LLMs must satisfy to bridge this gap. We also propose actions by CPS-IoT researchers to collaborate in developing key community resources necessary for establishing FMs and LLMs as foundational tools for the next generation of CPS-IoT systems. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 22 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p>
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.15414">arXiv:2501.15414</a> <span> [<a href="https://arxiv.org/pdf/2501.15414">pdf</a>, <a href="https://arxiv.org/format/2501.15414">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Semantic Communication with Entropy-and-Channel-Adaptive Rate Control </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Chen%2C+W">Weixuan Chen</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+Y">Yuhao Chen</a>, <a href="/search/eess?searchtype=author&query=Yang%2C+Q">Qianqian Yang</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+C">Chongwen Huang</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+Q">Qian Wang</a>, <a href="/search/eess?searchtype=author&query=Xiong%2C+Z">Zehui Xiong</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+Z">Zhaoyang Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Traditional wireless image transmission methods struggle to balance rate efficiency and reconstruction quality under varying channel conditions. To address these challenges, we propose a novel semantic communication (SemCom) system that integrates entropy-aware and channel-adaptive mechanisms for wireless image transmission over multi-user multiple-input multiple-output (MU-MIMO) fading channels. Unlike existing approaches, our system dynamically adjusts transmission rates based on the entropy of feature maps, channel state information (CSI), and signal-to-noise ratio (SNR), ensuring optimal resource utilization and robust performance. The system employs feature map pruning, channel attention, spatial attention, and multihead self-attention (MHSA) mechanisms to prioritize critical semantic features and effectively reconstruct images. Experimental results demonstrate that the proposed system outperforms state-of-the-art benchmarks, including BPG+LDPC+4QAM and Deep JSCC, in terms of rate-distortion performance, flexibility, and robustness, particularly under challenging conditions such as low SNR, imperfect CSI, and inter-user interference. This work establishes a strong foundation for adaptive-rate SemCom systems and highlights their potential for real-time, bandwidth-intensive applications. </span> </p>
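<p>A toy sketch of the entropy-aware rate adaptation described above: feature maps are ranked by empirical entropy, and only the fraction that the current SNR can support is transmitted. The histogram entropy estimator and the SNR-to-rate mapping are placeholders, not the paper's design.</p> <pre><code>import numpy as np

def feature_entropy(feature_maps, bins=16):
    # Empirical per-map entropy (bits); low-entropy maps are pruned first.
    entropies = []
    for fm in feature_maps:
        hist, _ = np.histogram(fm, bins=bins)
        p = hist[hist > 0] / hist.sum()
        entropies.append(-(p * np.log2(p)).sum())
    return np.array(entropies)

def keep_fraction(snr_db, lo=0.25, hi=1.0):
    # Placeholder SNR-to-rate mapping: send more maps on better channels.
    return float(np.clip((snr_db + 5.0) / 25.0, lo, hi))

features = np.random.randn(64, 8, 8)                 # encoder output maps
k = int(keep_fraction(snr_db=10.0) * len(features))
order = np.argsort(feature_entropy(features))[::-1]  # most informative first
transmitted = features[order[:k]]                    # pruned representation
</code></pre>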
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.15368">arXiv:2501.15368</a> <span> [<a href="https://arxiv.org/pdf/2501.15368">pdf</a>, <a href="https://arxiv.org/format/2501.15368">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Baichuan-Omni-1.5 Technical Report </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Li%2C+Y">Yadong Li</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+J">Jun Liu</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+T">Tao Zhang</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+T">Tao Zhang</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+S">Song Chen</a>, <a href="/search/eess?searchtype=author&query=Li%2C+T">Tianpeng Li</a>, <a href="/search/eess?searchtype=author&query=Li%2C+Z">Zehuan Li</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+L">Lijun Liu</a>, <a href="/search/eess?searchtype=author&query=Ming%2C+L">Lingfeng Ming</a>, <a href="/search/eess?searchtype=author&query=Dong%2C+G">Guosheng Dong</a>, <a href="/search/eess?searchtype=author&query=Pan%2C+D">Da Pan</a>, <a href="/search/eess?searchtype=author&query=Li%2C+C">Chong Li</a>, <a href="/search/eess?searchtype=author&query=Fang%2C+Y">Yuanbo Fang</a>, <a href="/search/eess?searchtype=author&query=Kuang%2C+D">Dongdong Kuang</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+M">Mingrui Wang</a>, <a href="/search/eess?searchtype=author&query=Zhu%2C+C">Chenglin Zhu</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+Y">Youwei Zhang</a>, <a href="/search/eess?searchtype=author&query=Guo%2C+H">Hongyu Guo</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+F">Fengyu Zhang</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+Y">Yuran Wang</a>, <a href="/search/eess?searchtype=author&query=Ding%2C+B">Bowen Ding</a>, <a href="/search/eess?searchtype=author&query=Song%2C+W">Wei Song</a>, <a href="/search/eess?searchtype=author&query=Li%2C+X">Xu Li</a>, <a href="/search/eess?searchtype=author&query=Huo%2C+Y">Yuqi Huo</a>, <a href="/search/eess?searchtype=author&query=Liang%2C+Z">Zheng Liang</a> , et al. (68 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>:
<span class="abstract-full has-text-grey-dark mathjax"> We introduce Baichuan-Omni-1.5, an omni-modal model that not only has omni-modal understanding capabilities but also provides end-to-end audio generation capabilities. To achieve fluent and high-quality interaction across modalities without compromising the capabilities of any modality, we prioritized optimizing three key aspects. First, we establish a comprehensive data cleaning and synthesis pipeline for multimodal data, obtaining about 500B high-quality data (text, audio, and vision). Second, an audio-tokenizer (Baichuan-Audio-Tokenizer) has been designed to capture both semantic and acoustic information from audio, enabling seamless integration and enhanced compatibility with MLLM. Lastly, we designed a multi-stage training strategy that progressively integrates multimodal alignment and multitask fine-tuning, ensuring effective synergy across all modalities. Baichuan-Omni-1.5 leads contemporary models (including GPT4o-mini and MiniCPM-o 2.6) in terms of comprehensive omni-modal capabilities. Notably, it achieves results comparable to leading models such as Qwen2-VL-72B across various multimodal medical benchmarks. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p>
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.15032">arXiv:2501.15032</a> <span> [<a href="https://arxiv.org/pdf/2501.15032">pdf</a>, <a href="https://arxiv.org/format/2501.15032">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Stealthy Voice Eavesdropping with Acoustic Metamaterials: Unraveling a New Privacy Threat </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Ning%2C+Z">Zhiyuan Ning</a>, <a href="/search/eess?searchtype=author&query=Tang%2C+Z">Zhanyong Tang</a>, <a href="/search/eess?searchtype=author&query=He%2C+J">Juan He</a>, <a href="/search/eess?searchtype=author&query=Meng%2C+W">Weizhi Meng</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+Y">Yuntian Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> We present SuperEar, a novel privacy threat based on acoustic metamaterials. Unlike previous research, SuperEar can surreptitiously track and eavesdrop on the phone calls of a moving outdoor target from a safe distance. To design this attack, SuperEar overcomes the challenges faced by traditional acoustic metamaterials, including low gain at low frequencies and audio distortion during reconstruction. It successfully magnifies the speech signal by approximately 20 times, allowing the sound to be captured from the earpiece of the target phone. In addition, SuperEar optimizes the trade-off between the number and size of acoustic metamaterials, improving the portability and concealability of the interceptor while ensuring effective interception performance. This makes it highly suitable for outdoor tracking and eavesdropping scenarios. Through extensive experimentation, we have evaluated SuperEar and our results show that it can achieve an eavesdropping accuracy of over 80% within a range of 4.5 meters in the aforementioned scenario, thus validating its great potential in real-world applications. </span> </p>
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.13971">arXiv:2501.13971</a> <span> [<a href="https://arxiv.org/pdf/2501.13971">pdf</a>, <a href="https://arxiv.org/format/2501.13971">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> GS-LiDAR: Generating Realistic LiDAR Point Clouds with Panoramic Gaussian Splatting </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Jiang%2C+J">Junzhe Jiang</a>, <a href="/search/eess?searchtype=author&query=Gu%2C+C">Chun Gu</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+Y">Yurui Chen</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+L">Li Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> LiDAR novel view synthesis (NVS) has emerged as a novel task within LiDAR simulation, offering valuable simulated point cloud data from novel viewpoints to aid in autonomous driving systems. However, existing LiDAR NVS methods typically rely on neural radiance fields (NeRF) as their 3D representation, which incurs significant computational costs in both training and rendering. Moreover, NeRF and its variants are designed for symmetrical scenes, making them ill-suited for driving scenarios. To address these challenges, we propose GS-LiDAR, a novel framework for generating realistic LiDAR point clouds with panoramic Gaussian splatting. Our approach employs 2D Gaussian primitives with periodic vibration properties, allowing for precise geometric reconstruction of both static and dynamic elements in driving scenarios. We further introduce a novel panoramic rendering technique with explicit ray-splat intersection, guided by panoramic LiDAR supervision.
By incorporating intensity and ray-drop spherical harmonic (SH) coefficients into the Gaussian primitives, we enhance the realism of the rendered point clouds. Extensive experiments on KITTI-360 and nuScenes demonstrate the superiority of our method in terms of quantitative metrics, visual quality, and training and rendering efficiency. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 22 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.13006">arXiv:2501.13006</a> <span> [<a href="https://arxiv.org/pdf/2501.13006">pdf</a>, <a href="https://arxiv.org/format/2501.13006">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Terahertz Integrated Sensing Communications and Powering for 6G Wireless Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Yan%2C+H">Hua Yan</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+Y">Yunfei Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> The terahertz (THz) band has attracted significant interest for future wireless networks. In this paper, a THz integrated sensing communications and powering (THz-ISCAP) system, where sensing is leveraged to enhance communications and powering, is studied. For a given total amount of time, we aim to determine an optimal time allocation for sensing to improve the efficiency of communications and powering, along with an optimal power splitting ratio to balance these two functionalities. This is achieved by maximizing either the achievable rate or the harvested energy while ensuring a minimum requirement on the other. Numerical results indicate that the optimal system performance can be achieved by jointly optimizing the sensing time allocation and the power splitting ratio.
Additionally, the results reveal the effects of various factors, such as THz frequencies and antenna aperture sizes, on the system performance. This study offers a new perspective for research on THz-ISCAP. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">12 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.11842">arXiv:2501.11842</a> <span> [<a href="https://arxiv.org/pdf/2501.11842">pdf</a>, <a href="https://arxiv.org/format/2501.11842">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Harnessing Rydberg Atomic Receivers: From Quantum Physics to Wireless Communications </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Chen%2C+Y">Yuanbin Chen</a>, <a href="/search/eess?searchtype=author&query=Guo%2C+X">Xufeng Guo</a>, <a href="/search/eess?searchtype=author&query=Yuen%2C+C">Chau Yuen</a>, <a href="/search/eess?searchtype=author&query=Zhao%2C+Y">Yufei Zhao</a>, <a href="/search/eess?searchtype=author&query=Guan%2C+Y+L">Yong Liang Guan</a>, <a href="/search/eess?searchtype=author&query=See%2C+C+M+S">Chong Meng Samson See</a>, <a href="/search/eess?searchtype=author&query=D%C3%A9bbah%2C+M">Merouane Débbah</a>, <a href="/search/eess?searchtype=author&query=Hanzo%2C+L">Lajos Hanzo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> The intrinsic integration of Rydberg atomic receivers into wireless communication systems is proposed by harnessing the principles of quantum physics in wireless communications.
More particularly, we conceive a pair of Rydberg atomic receivers, one of which incorporates a local oscillator (LO) and is referred to as an LO-dressed receiver, while the other operates without an LO and is termed an LO-free receiver. The appropriate wireless model is developed for each configuration, elaborating on the receiver's responses to the radio frequency (RF) signal, on the potential noise sources, and on the system performance. Next, we investigate the associated distortion effects that might occur, specifically demonstrating the boundaries of the linear dynamic regions, which provides critical insights into practical implementations in wireless systems. Extensive simulation results are provided for characterizing the performance of wireless systems harnessing this pair of Rydberg atomic receivers. Our results demonstrate that they deliver complementary benefits: LO-free systems excel in proximity operations, while LO-dressed systems are eminently suitable for long-distance sensing at extremely low power levels. More specifically, LO-dressed systems achieve a significant signal-to-noise ratio (SNR) gain of approximately 44 dB over conventional RF receivers, exhibiting an effective coverage range extension over conventional RF receivers by a factor of 150. Furthermore, LO-dressed systems support higher-order quadrature amplitude modulation (QAM) at reduced symbol error rates (SER) compared to conventional RF receivers, hence significantly enhancing wireless communication performance. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p>
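<p class="mathjax">As a back-of-envelope consistency check (assuming free-space, inverse-square propagation, an assumption of ours rather than the paper's stated model), a receiver SNR gain of $G$ dB stretches the usable range by $$\frac{d'_{\max}}{d_{\max}} = 10^{G/20}, \qquad 10^{44/20} \approx 1.6 \times 10^{2},$$ which is consistent with the reported 44 dB gain and roughly 150-fold coverage extension.</p>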
<p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This manuscript has been submitted to IEEE journal, with 13 pages of body and 2 pages of supplementary material</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.10337">arXiv:2501.10337</a> <span> [<a href="https://arxiv.org/pdf/2501.10337">pdf</a>, <a href="https://arxiv.org/format/2501.10337">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Uncertainty-Aware Digital Twins: Robust Model Predictive Control using Time-Series Deep Quantile Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Chen%2C+Y">Yi-Ping Chen</a>, <a href="/search/eess?searchtype=author&query=Tsai%2C+Y">Ying-Kuan Tsai</a>, <a href="/search/eess?searchtype=author&query=Karkaria%2C+V">Vispi Karkaria</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+W">Wei Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Digital Twins, virtual replicas of physical systems that enable real-time monitoring, model updates, predictions, and decision-making, present novel avenues for proactive control strategies for autonomous systems. However, achieving real-time decision-making in Digital Twins considering uncertainty necessitates an efficient uncertainty quantification (UQ) approach and optimization driven by accurate predictions of system behaviors, which remains a challenge for learning-based methods. This paper presents a simultaneous multi-step robust model predictive control (MPC) framework that incorporates real-time decision-making with uncertainty awareness for Digital Twin systems. Leveraging a multistep-ahead predictor named Time-Series Dense Encoder (TiDE) as the surrogate model, this framework differs from conventional MPC models that provide only one-step-ahead predictions. In contrast, TiDE can predict future states within the prediction horizon in one shot, significantly accelerating MPC. Furthermore, quantile regression is employed in the training of TiDE to perform flexible yet computationally efficient UQ on data uncertainty.
Consequently, with the deep-learning quantiles, the robust MPC problem is formulated as a deterministic optimization problem with a safety buffer that accommodates disturbances, enhancing the constraint satisfaction rate. As a result, the proposed method outperforms existing robust MPC methods by providing less-conservative UQ and has demonstrated efficacy in an engineering case study involving Directed Energy Deposition (DED) additive manufacturing. This proactive, uncertainty-aware control capability positions the proposed method as a potent tool for future Digital Twin applications and real-time process control in engineering systems. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.09352">arXiv:2501.09352</a> <span> [<a href="https://arxiv.org/pdf/2501.09352">pdf</a>, <a href="https://arxiv.org/format/2501.09352">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> PAL: Prompting Analytic Learning with Missing Modality for Multi-Modal Class-Incremental Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Yue%2C+X">Xianghu Yue</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+Y">Yiming Chen</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+X">Xueyi Zhang</a>, <a href="/search/eess?searchtype=author&query=Gao%2C+X">Xiaoxue Gao</a>, <a href="/search/eess?searchtype=author&query=Feng%2C+M">Mengling Feng</a>, <a href="/search/eess?searchtype=author&query=Lao%2C+M">Mingrui Lao</a>, <a href="/search/eess?searchtype=author&query=Zhuang%2C+H">Huiping Zhuang</a>, <a href="/search/eess?searchtype=author&query=Li%2C+H">Haizhou Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>:
<span class="abstract-full has-text-grey-dark mathjax"> Multi-modal class-incremental learning (MMCIL) seeks to leverage multi-modal data, such as audio-visual and image-text pairs, thereby enabling models to learn continuously across a sequence of tasks while mitigating forgetting. While existing studies primarily focus on the integration and utilization of multi-modal information for MMCIL, a critical challenge remains: the issue of missing modalities during incremental learning phases. This oversight can exacerbate severe forgetting and significantly impair model performance. To bridge this gap, we propose PAL, a novel exemplar-free framework tailored to MMCIL under missing-modality scenarios. Concretely, we devise modality-specific prompts to compensate for missing information, helping the model maintain a holistic representation of the data. On this foundation, we reformulate the MMCIL problem into a Recursive Least-Squares task, delivering an analytical linear solution. Building upon these, PAL not only alleviates the inherent under-fitting limitation in analytic learning but also preserves the holistic representation of missing-modality data, achieving superior performance with less forgetting across various multi-modal incremental scenarios. Extensive experiments demonstrate that PAL significantly outperforms competitive methods across various datasets, including UPMC-Food101 and N24News, showcasing its robustness towards modality absence and its anti-forgetting ability to maintain high incremental accuracy. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p>
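<p>The Recursive Least-Squares reformulation mentioned above admits a compact analytic update. Below is a minimal NumPy sketch of a generic exemplar-free RLS classifier head of the kind analytic class-incremental learning uses; the class name and sizes are illustrative, and PAL's modality-specific prompting is omitted.</p> <pre><code>import numpy as np

class AnalyticHead:
    """Generic recursive least-squares classifier head: each task updates
    the weights analytically from its own features, storing no exemplars."""
    def __init__(self, dim, n_classes, lam=1e-3):
        self.R = np.eye(dim) / lam            # inverse regularized autocorrelation
        self.W = np.zeros((dim, n_classes))   # linear classifier weights

    def update(self, Phi, Y):
        # The Woodbury identity keeps the inverse up to date without
        # refitting on past data, which makes the solution exemplar-free.
        K = Phi @ self.R
        self.R -= K.T @ np.linalg.inv(np.eye(len(Phi)) + K @ Phi.T) @ K
        self.W += self.R @ Phi.T @ (Y - Phi @ self.W)

head = AnalyticHead(dim=32, n_classes=10)
for _ in range(3):                            # a sequence of incremental tasks
    Phi = np.random.randn(100, 32)            # backbone features for the task
    Y = np.eye(10)[np.random.randint(10, size=100)]
    head.update(Phi, Y)
</code></pre>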
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.09289">arXiv:2501.09289</a> <span> [<a href="https://arxiv.org/pdf/2501.09289">pdf</a>, <a href="https://arxiv.org/format/2501.09289">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Optimization and Control">math.OC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Control Barrier Function-Based Safety Filters: Characterization of Undesired Equilibria, Unbounded Trajectories, and Limit Cycles </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Mestres%2C+P">Pol Mestres</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+Y">Yiting Chen</a>, <a href="/search/eess?searchtype=author&query=Dall%27anese%2C+E">Emiliano Dall'anese</a>, <a href="/search/eess?searchtype=author&query=Cort%C3%A9s%2C+J">Jorge Cortés</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.09289v1-abstract-short" style="display: inline;"> This paper focuses on safety filters designed based on Control Barrier Functions (CBFs): these are modifications of a nominal stabilizing controller typically utilized in safety-critical control applications to render a given subset of states forward invariant. The paper investigates the dynamical properties of the closed-loop systems, with a focus on characterizing undesirable behaviors that may… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.09289v1-abstract-full').style.display = 'inline'; document.getElementById('2501.09289v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.09289v1-abstract-full" style="display: none;"> This paper focuses on safety filters designed based on Control Barrier Functions (CBFs): these are modifications of a nominal stabilizing controller typically utilized in safety-critical control applications to render a given subset of states forward invariant. The paper investigates the dynamical properties of the closed-loop systems, with a focus on characterizing undesirable behaviors that may emerge due to the use of CBF-based filters. These undesirable behaviors include unbounded trajectories, limit cycles, and undesired equilibria, which can be locally stable and even form a continuum. Our analysis offers the following contributions: (i) conditions under which trajectories remain bounded and (ii) conditions under which limit cycles do not exist; (iii) we show that undesired equilibria can be characterized by solving an algebraic equation, and (iv) we provide examples that show that asymptotically stable undesired equilibria can exist for a large class of nominal controllers and design parameters of the safety filter (even for convex safe sets). Further, for the specific class of planar systems, (v) we provide explicit formulas for the total number of undesired equilibria and the proportion of saddle points and asymptotically stable equilibria, and (vi) in the case of linear planar systems, we present an exhaustive analysis of their global stability properties. Examples throughout the paper illustrate the results.
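For a single barrier constraint, the CBF-based safety filter studied in this paper reduces to a quadratic program with a closed-form solution: project the nominal input onto the half-space where the barrier condition holds. A minimal sketch under an illustrative single-integrator example (the function names and toy dynamics are ours, not the paper's):

```python
import numpy as np

def cbf_filter(x, u_nom, f, g, h, grad_h, alpha=1.0):
    """Closed-form CBF-QP for one constraint:
    min ||u - u_nom||^2  s.t.  grad_h(x) . (f(x) + g(x) u) >= -alpha * h(x).
    Assumes the constraint row 'a' is nonzero whenever it is violated."""
    a = grad_h(x) @ g(x)                  # coefficient of u in the constraint
    b = grad_h(x) @ f(x) + alpha * h(x)   # constant part
    slack = a @ u_nom + b
    if slack >= 0:                        # nominal input already safe
        return u_nom
    return u_nom - (slack / (a @ a)) * a  # minimal correction onto the boundary

# planar toy: keep ||x|| <= 1 for single-integrator dynamics x_dot = u
f = lambda x: np.zeros(2)
g = lambda x: np.eye(2)
h = lambda x: 1.0 - x @ x                 # safe set is {x : h(x) >= 0}
grad_h = lambda x: -2.0 * x
x = np.array([0.9, 0.0])
u_nom = np.array([1.0, 0.0])              # nominal controller pushes outward
print(cbf_filter(x, u_nom, f, g, h, grad_h))
```

The closed loop x_dot = f + g * cbf_filter(...) is exactly the kind of piecewise-defined system whose undesired equilibria and limit cycles the paper characterizes.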
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.09289v1-abstract-full').style.display = 'none'; document.getElementById('2501.09289v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.07601">arXiv:2501.07601</a> <span> [<a href="https://arxiv.org/pdf/2501.07601">pdf</a>, <a href="https://arxiv.org/format/2501.07601">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Real-Time Decision-Making for Digital Twin in Additive Manufacturing with Model Predictive Control using Time-Series Deep Neural Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Chen%2C+Y">Yi-Ping Chen</a>, <a href="/search/eess?searchtype=author&query=Karkaria%2C+V">Vispi Karkaria</a>, <a href="/search/eess?searchtype=author&query=Tsai%2C+Y">Ying-Kuan Tsai</a>, <a href="/search/eess?searchtype=author&query=Rolark%2C+F">Faith Rolark</a>, <a href="/search/eess?searchtype=author&query=Quispe%2C+D">Daniel Quispe</a>, <a href="/search/eess?searchtype=author&query=Gao%2C+R+X">Robert X. Gao</a>, <a href="/search/eess?searchtype=author&query=Cao%2C+J">Jian Cao</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+W">Wei Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.07601v2-abstract-short" style="display: inline;"> Digital Twin-a virtual replica of a physical system enabling real-time monitoring, model updating, prediction, and decision-making-combined with recent advances in machine learning (ML), offers new opportunities for proactive control strategies in autonomous manufacturing. However, achieving real-time decision-making with Digital Twins requires efficient optimization driven by accurate predictions… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.07601v2-abstract-full').style.display = 'inline'; document.getElementById('2501.07601v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.07601v2-abstract-full" style="display: none;"> Digital Twin-a virtual replica of a physical system enabling real-time monitoring, model updating, prediction, and decision-making-combined with recent advances in machine learning (ML), offers new opportunities for proactive control strategies in autonomous manufacturing. However, achieving real-time decision-making with Digital Twins requires efficient optimization driven by accurate predictions of highly nonlinear manufacturing systems. 
This paper presents a simultaneous multi-step Model Predictive Control (MPC) framework for real-time decision-making, using a multi-variate deep neural network (DNN), named Time-Series Dense Encoder (TiDE), as the surrogate model. Different from the models in conventional MPC which only provide one-step ahead prediction, TiDE is capable of predicting future states within the prediction horizon in one shot (multi-step), significantly accelerating MPC. Using Directed Energy Deposition additive manufacturing as a case study, we demonstrate the effectiveness of the proposed MPC in achieving melt pool temperature tracking to ensure part quality, while reducing porosity defects by regulating laser power to maintain melt pool depth constraints. In this work, we first show that TiDE is capable of accurately predicting melt pool temperature and depth. Second, we demonstrate that the proposed MPC achieves precise temperature tracking while satisfying melt pool depth constraints within a targeted dilution range (10%-30%), reducing potential porosity defects. Compared to the PID controller, MPC results in smoother and less fluctuating laser power profiles with competitive or superior melt pool temperature control performance. This demonstrates MPC's proactive control capabilities, leveraging time-series prediction and real-time optimization, positioning it as a powerful tool for future Digital Twin applications and real-time process optimization in manufacturing. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.07601v2-abstract-full').style.display = 'none'; document.getElementById('2501.07601v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 10 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.06595">arXiv:2501.06595</a> <span> [<a href="https://arxiv.org/pdf/2501.06595">pdf</a>, <a href="https://arxiv.org/format/2501.06595">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Fast multi-contrast MRI using joint multiscale energy model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Yaghoobi%2C+N">Nima Yaghoobi</a>, <a href="/search/eess?searchtype=author&query=Chand%2C+J+R">Jyothi Rikhab Chand</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+Y">Yan Chen</a>, <a href="/search/eess?searchtype=author&query=Kecskemeti%2C+S+R">Steve R. Kecskemeti</a>, <a href="/search/eess?searchtype=author&query=Holmes%2C+J+H">James H. Holmes</a>, <a href="/search/eess?searchtype=author&query=Jacob%2C+M">Mathews Jacob</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.06595v1-abstract-short" style="display: inline;"> The acquisition of 3D multicontrast MRI data with good isotropic spatial resolution is challenged by lengthy scan times. 
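The practical point of the abstract above is that a multi-step surrogate lets the entire MPC cost be evaluated in one forward pass, so the control sequence over the horizon can be optimized directly instead of being rolled out step by step. A minimal sketch with a placeholder linear map standing in for the trained TiDE network; the horizon, bounds, weights, and depth band are illustrative assumptions, not the paper's settings:

```python
import numpy as np
from scipy.optimize import minimize

H = 10                      # prediction horizon (steps)
T_ref = np.full(H, 1650.0)  # melt-pool temperature setpoint, illustrative

# Stand-in for the trained multi-step surrogate: maps the whole laser-power
# sequence to temperature and depth trajectories in one forward pass.
rng = np.random.default_rng(1)
A_T = rng.uniform(0.5, 1.5, (H, H)) * np.tri(H)  # causal power-to-temp map
A_d = 0.001 * np.tri(H)                          # and power-to-depth map

def surrogate(u):                 # u: (H,) laser power sequence
    return A_T @ u, A_d @ u       # multi-step prediction, no recursive rollout

def mpc_cost(u):
    T_pred, d_pred = surrogate(u)
    track = np.sum((T_pred - T_ref) ** 2)        # temperature tracking
    smooth = 10.0 * np.sum(np.diff(u) ** 2)      # penalize power fluctuation
    d_lo, d_hi = 0.3, 0.9                        # illustrative depth band
    viol = np.maximum(d_lo - d_pred, 0) + np.maximum(d_pred - d_hi, 0)
    return track + smooth + 1e6 * np.sum(viol ** 2)

u0 = np.full(H, 1000.0)                          # initial power guess
res = minimize(mpc_cost, u0, method="L-BFGS-B",
               bounds=[(0.0, 2000.0)] * H)       # actuator limits
u_opt = res.x                                    # apply u_opt[0], then re-solve
```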
In this work, we introduce a CNN-based multiscale energy model to learn the joint probability distribution of the multi-contrast images. The joint recovery of the contrasts from undersampled data is posed as a maximum a posteriori estimation scheme, where the learned energy serv… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.06595v1-abstract-full').style.display = 'inline'; document.getElementById('2501.06595v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.06595v1-abstract-full" style="display: none;"> The acquisition of 3D multicontrast MRI data with good isotropic spatial resolution is challenged by lengthy scan times. In this work, we introduce a CNN-based multiscale energy model to learn the joint probability distribution of the multi-contrast images. The joint recovery of the contrasts from undersampled data is posed as a maximum a posteriori estimation scheme, where the learned energy serves as the prior. We use a majorize-minimize algorithm to solve the optimization scheme. The proposed model leverages the redundancies across different contrasts to improve image fidelity. The proposed scheme is observed to preserve fine details and contrast, offering sharper reconstructions compared to reconstruction methods that independently recover the contrasts. While we focus on 3D MPNRAGE acquisitions in this work, the proposed approach is generalizable to arbitrary multi-contrast settings. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.06595v1-abstract-full').style.display = 'none'; document.getElementById('2501.06595v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. 
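A compact way to read the MAP formulation above: the learned energy acts as the prior of a regularized inverse problem. The sketch below uses plain gradient descent for brevity (the paper uses a majorize-minimize algorithm) with a toy pixel mask in place of the real undersampled MRI forward operator; the tiny CNN is only a stand-in for the trained multiscale energy:

```python
import torch

# stand-in for the learned joint energy E_theta over two stacked contrasts
energy_net = torch.nn.Sequential(
    torch.nn.Conv2d(2, 8, 3, padding=1), torch.nn.SiLU(),
    torch.nn.Conv2d(8, 1, 3, padding=1),
)
E = lambda x: energy_net(x).mean()

def map_recover(A, b, x0, lam=0.1, step=1e-2, iters=200):
    """MAP estimate: argmin_x ||A(x) - b||^2 + lam * E(x)."""
    x = x0.clone().requires_grad_(True)
    for _ in range(iters):
        loss = ((A(x) - b) ** 2).sum() + lam * E(x)
        (g,) = torch.autograd.grad(loss, x)
        with torch.no_grad():
            x -= step * g                    # gradient step on the MAP cost
    return x.detach()

# toy "undersampling": keep roughly half the samples of both contrasts
mask = (torch.rand(1, 2, 32, 32) > 0.5).float()
A = lambda x: mask * x
x_true = torch.randn(1, 2, 32, 32)           # two contrasts, recovered jointly
b = A(x_true)
x_hat = map_recover(A, b, x0=b.clone())
```

Because both contrasts enter the energy together, the prior can exploit cross-contrast redundancy, which is what the joint recovery leverages.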
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.06552">arXiv:2501.06552</a> <span> [<a href="https://arxiv.org/pdf/2501.06552">pdf</a>, <a href="https://arxiv.org/format/2501.06552">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> When xURLLC Meets NOMA: A Stochastic Network Calculus Perspective </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Chen%2C+Y">Yuang Chen</a>, <a href="/search/eess?searchtype=author&query=Lu%2C+H">Hancheng Lu</a>, <a href="/search/eess?searchtype=author&query=Qin%2C+L">Langtin Qin</a>, <a href="/search/eess?searchtype=author&query=Deng%2C+Y">Yansha Deng</a>, <a href="/search/eess?searchtype=author&query=Nallanathan%2C+A">Arumugam Nallanathan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.06552v1-abstract-short" style="display: inline;"> The advent of next-generation ultra-reliable and low-latency communications (xURLLC) presents stringent and unprecedented requirements for key performance indicators (KPIs). As a disruptive technology, non-orthogonal multiple access (NOMA) harbors the potential to fulfill these stringent KPIs essential for xURLLC. However, the immaturity of research on the tail distributions of these KPIs signific… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.06552v1-abstract-full').style.display = 'inline'; document.getElementById('2501.06552v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.06552v1-abstract-full" style="display: none;"> The advent of next-generation ultra-reliable and low-latency communications (xURLLC) presents stringent and unprecedented requirements for key performance indicators (KPIs). As a disruptive technology, non-orthogonal multiple access (NOMA) harbors the potential to fulfill these stringent KPIs essential for xURLLC. However, the immaturity of research on the tail distributions of these KPIs significantly impedes the application of NOMA to xURLLC. Stochastic network calculus (SNC), as a potent methodology, is leveraged to provide dependable theoretical insights into tail distribution analysis and statistical QoS provisioning (SQP). In this article, we develop a NOMA-assisted uplink xURLLC network architecture that incorporates an SNC-based SQP theoretical framework (SNC-SQP) to support tail distribution analysis in terms of delay, age-of-information (AoI), and reliability. Based on SNC-SQP, an SQP-driven power optimization problem is proposed to minimize transmit power while guaranteeing xURLLC's KPIs on delay, AoI, reliability, and power consumption. Extensive simulations validate our proposed theoretical framework and demonstrate that the proposed power allocation scheme significantly reduces uplink transmit power and outperforms conventional schemes in terms of SQP performance. 
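The tail-distribution perspective above comes down to bounding or estimating delay-violation probabilities such as P(delay > d). As a simple baseline against which closed-form SNC-style bounds can be sanity-checked, here is a Monte-Carlo estimate via Lindley's recursion for a single-server FIFO queue; the arrival and service rates are illustrative and this is not the paper's NOMA model:

```python
import numpy as np

def delay_tail(lam, mu, d_target, n=100_000, seed=0):
    """Monte-Carlo estimate of the steady-state delay-violation probability
    P(delay > d_target) for an M/M/1 FIFO queue via Lindley's recursion."""
    rng = np.random.default_rng(seed)
    inter = rng.exponential(1 / lam, n)   # packet inter-arrival times
    serv = rng.exponential(1 / mu, n)     # per-packet service times
    w = np.empty(n)
    w[0] = 0.0
    for i in range(1, n):                 # Lindley: waiting-time recursion
        w[i] = max(0.0, w[i - 1] + serv[i - 1] - inter[i])
    delay = w + serv                      # sojourn time = wait + own service
    return (delay > d_target).mean()

# e.g. check an illustrative reliability target at a 1 ms delay budget
eps = delay_tail(lam=8e3, mu=1e4, d_target=1e-3)
print(f"P(delay > 1 ms) ~ {eps:.2e}  (xURLLC-style target: <= 1e-5)")
```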
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.06552v1-abstract-full').style.display = 'none'; document.getElementById('2501.06552v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">7 pages, 5 figures, accepted by IEEE Communications Magazine</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.06282">arXiv:2501.06282</a> <span> [<a href="https://arxiv.org/pdf/2501.06282">pdf</a>, <a href="https://arxiv.org/format/2501.06282">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> MinMo: A Multimodal Large Language Model for Seamless Voice Interaction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Chen%2C+Q">Qian Chen</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+Y">Yafeng Chen</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+Y">Yanni Chen</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+M">Mengzhe Chen</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+Y">Yingda Chen</a>, <a href="/search/eess?searchtype=author&query=Deng%2C+C">Chong Deng</a>, <a href="/search/eess?searchtype=author&query=Du%2C+Z">Zhihao Du</a>, <a href="/search/eess?searchtype=author&query=Gao%2C+R">Ruize Gao</a>, <a href="/search/eess?searchtype=author&query=Gao%2C+C">Changfeng Gao</a>, <a href="/search/eess?searchtype=author&query=Gao%2C+Z">Zhifu Gao</a>, <a href="/search/eess?searchtype=author&query=Li%2C+Y">Yabin Li</a>, <a href="/search/eess?searchtype=author&query=Lv%2C+X">Xiang Lv</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+J">Jiaqing Liu</a>, <a href="/search/eess?searchtype=author&query=Luo%2C+H">Haoneng Luo</a>, <a href="/search/eess?searchtype=author&query=Ma%2C+B">Bin Ma</a>, <a href="/search/eess?searchtype=author&query=Ni%2C+C">Chongjia Ni</a>, <a href="/search/eess?searchtype=author&query=Shi%2C+X">Xian Shi</a>, <a href="/search/eess?searchtype=author&query=Tang%2C+J">Jialong Tang</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+H">Hui Wang</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+H">Hao Wang</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+W">Wen Wang</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+Y">Yuxuan Wang</a>, <a href="/search/eess?searchtype=author&query=Xu%2C+Y">Yunlan Xu</a>, <a href="/search/eess?searchtype=author&query=Yu%2C+F">Fan Yu</a>, <a href="/search/eess?searchtype=author&query=Yan%2C+Z">Zhijie Yan</a> , et al. 
(11 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.06282v1-abstract-short" style="display: inline;"> Recent advancements in large language models (LLMs) and multimodal speech-text models have laid the groundwork for seamless voice interactions, enabling real-time, natural, and human-like conversations. Previous models for voice interactions are categorized as native and aligned. Native models integrate speech and text processing in one framework but struggle with issues like differing sequence le… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.06282v1-abstract-full').style.display = 'inline'; document.getElementById('2501.06282v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.06282v1-abstract-full" style="display: none;"> Recent advancements in large language models (LLMs) and multimodal speech-text models have laid the groundwork for seamless voice interactions, enabling real-time, natural, and human-like conversations. Previous models for voice interactions are categorized as native and aligned. Native models integrate speech and text processing in one framework but struggle with issues like differing sequence lengths and insufficient pre-training. Aligned models maintain text LLM capabilities but are often limited by small datasets and a narrow focus on speech tasks. In this work, we introduce MinMo, a Multimodal Large Language Model with approximately 8B parameters for seamless voice interaction. We address the main limitations of prior aligned multimodal models. We train MinMo through multiple stages of speech-to-text alignment, text-to-speech alignment, speech-to-speech alignment, and duplex interaction alignment, on 1.4 million hours of diverse speech data and a broad range of speech tasks. After the multi-stage training, MinMo achieves state-of-the-art performance across various benchmarks for voice comprehension and generation while maintaining the capabilities of text LLMs, and also facilitates full-duplex conversation, that is, simultaneous two-way communication between the user and the system. Moreover, we propose a novel and simple voice decoder that outperforms prior models in voice generation. The enhanced instruction-following capabilities of MinMo support controlling speech generation based on user instructions, with various nuances including emotions, dialects, and speaking rates, and mimicking specific voices. For MinMo, the speech-to-text latency is approximately 100ms, and the full-duplex latency is approximately 600ms in theory and 800ms in practice. The MinMo project web page is https://funaudiollm.github.io/minmo, and the code and models will be released soon. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.06282v1-abstract-full').style.display = 'none'; document.getElementById('2501.06282v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Work in progress.
Authors are listed in alphabetical order by family name</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.05999">arXiv:2501.05999</a> <span> [<a href="https://arxiv.org/pdf/2501.05999">pdf</a>, <a href="https://arxiv.org/format/2501.05999">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> On the Sum Rate and User Fairness of STAR-RIS Aided Communications </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Li%2C+H">Haochen Li</a>, <a href="/search/eess?searchtype=author&query=Mu%2C+X">Xidong Mu</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+Y">Yuanwei Liu</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+Y">Yue Chen</a>, <a href="/search/eess?searchtype=author&query=Zhiwen%2C+P">Pan Zhiwen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.05999v1-abstract-short" style="display: inline;"> A simultaneously transmitting and reflecting reconfigurable intelligent surface (STAR-RIS) aided communication system is investigated. A robust joint beamforming design problem under the imperfect channel state information (CSI) is formulated to maximize the weighted sum of the Jain's fairness index and the normalized system sum rate. To solve this non-convex problem, an alternating optimization (… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.05999v1-abstract-full').style.display = 'inline'; document.getElementById('2501.05999v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.05999v1-abstract-full" style="display: none;"> A simultaneously transmitting and reflecting reconfigurable intelligent surface (STAR-RIS) aided communication system is investigated. A robust joint beamforming design problem under the imperfect channel state information (CSI) is formulated to maximize the weighted sum of the Jain's fairness index and the normalized system sum rate. To solve this non-convex problem, an alternating optimization (AO) algorithm is proposed, which leverages the S-Procedure, successive convex approximation (SCA), and semidefinite relaxation (SDR). Simulation results demonstrate that with the proposed algorithm: 1) various trade-offs between sum rate and user fairness can be achieved; 2) a larger trade-off region can be achieved by adopting STAR-RIS compared to conventional RIS; and 3) the performance degradation caused by imperfect CSI is less than 7% with our proposed robust beamforming approach. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.05999v1-abstract-full').style.display = 'none'; document.getElementById('2501.05999v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025.
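The scalarized objective described above, a weighted sum of Jain's fairness index and the normalized sum rate, is easy to state concretely. A minimal sketch (function names, the example rates, and the normalizing constant are illustrative assumptions):

```python
import numpy as np

def jain_index(rates):
    """Jain's fairness index: ranges from 1/K (one user takes all) to 1."""
    rates = np.asarray(rates, dtype=float)
    return rates.sum() ** 2 / (len(rates) * np.sum(rates ** 2))

def tradeoff_objective(rates, w, rate_max):
    """Weighted sum of fairness and normalized sum rate, the scalarized
    quantity the AO algorithm maximizes over the beamformers."""
    return w * jain_index(rates) + (1 - w) * np.sum(rates) / rate_max

rates = np.array([4.2, 1.1, 3.5, 0.9])   # per-UE rates in bit/s/Hz
for w in (0.0, 0.5, 1.0):                # sweep the trade-off weight
    print(w, round(tradeoff_objective(rates, w, rate_max=12.0), 3))
```

Sweeping the weight w traces out exactly the sum-rate versus fairness trade-off region that the simulations compare between STAR-RIS and conventional RIS.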
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages, 4 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.04379">arXiv:2501.04379</a> <span> [<a href="https://arxiv.org/pdf/2501.04379">pdf</a>, <a href="https://arxiv.org/format/2501.04379">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Phone-purity Guided Discrete Tokens for Dysarthric Speech Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Wang%2C+H">Huimeng Wang</a>, <a href="/search/eess?searchtype=author&query=Xie%2C+X">Xurong Xie</a>, <a href="/search/eess?searchtype=author&query=Geng%2C+M">Mengzhe Geng</a>, <a href="/search/eess?searchtype=author&query=Hu%2C+S">Shujie Hu</a>, <a href="/search/eess?searchtype=author&query=Xu%2C+H">Haoning Xu</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+Y">Youjun Chen</a>, <a href="/search/eess?searchtype=author&query=Li%2C+Z">Zhaoqing Li</a>, <a href="/search/eess?searchtype=author&query=Deng%2C+J">Jiajun Deng</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+X">Xunying Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.04379v1-abstract-short" style="display: inline;"> Extracted discrete tokens provide efficient and domain-adaptable speech features. Their application to disordered speech that exhibits articulation imprecision and large mismatch against normal voice remains unexplored. To improve their phonetic discrimination that is weakened during unsupervised K-means or vector quantization of continuous features, this paper proposes novel phone-purity guided (… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.04379v1-abstract-full').style.display = 'inline'; document.getElementById('2501.04379v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.04379v1-abstract-full" style="display: none;"> Extracted discrete tokens provide efficient and domain-adaptable speech features. Their application to disordered speech that exhibits articulation imprecision and large mismatch against normal voice remains unexplored. To improve their phonetic discrimination that is weakened during unsupervised K-means or vector quantization of continuous features, this paper proposes novel phone-purity guided (PPG) discrete tokens for dysarthric speech recognition. Phonetic label supervision is used to regularize maximum likelihood and reconstruction error costs used in standard K-means and VAE-VQ based discrete token extraction.
Experiments conducted on the UASpeech corpus suggest that the proposed PPG discrete token features extracted from HuBERT consistently outperform hybrid TDNN and End-to-End (E2E) Conformer systems using non-PPG based K-means or VAE-VQ tokens across varying codebook sizes by statistically significant word error rate (WER) reductions up to 0.99% and 1.77% absolute (3.21% and 4.82% relative) respectively on the UASpeech test set of 16 dysarthric speakers. The lowest WER of 23.25% was obtained by combining systems using different token features. Consistent improvements on the phone purity metric were also achieved. T-SNE visualization further demonstrates that sharper decision boundaries were produced between K-means/VAE-VQ clusters after introducing phone-purity guidance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.04379v1-abstract-full').style.display = 'none'; document.getElementById('2501.04379v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICASSP 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.04359">arXiv:2501.04359</a> <span> [<a href="https://arxiv.org/pdf/2501.04359">pdf</a>, <a href="https://arxiv.org/format/2501.04359">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Decoding EEG Speech Perception with Transformers and VAE-based Data Augmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Chen%2C+T+Y">Terrance Yu-Hao Chen</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+Y">Yulin Chen</a>, <a href="/search/eess?searchtype=author&query=Soederhaell%2C+P">Pontus Soederhaell</a>, <a href="/search/eess?searchtype=author&query=Agrawal%2C+S">Sadrishya Agrawal</a>, <a href="/search/eess?searchtype=author&query=Shapovalenko%2C+K">Kateryna Shapovalenko</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.04359v1-abstract-short" style="display: inline;"> Decoding speech from non-invasive brain signals, such as electroencephalography (EEG), has the potential to advance brain-computer interfaces (BCIs), with applications in silent communication and assistive technologies for individuals with speech impairments.
However, EEG-based speech decoding faces major challenges, such as noisy data, limited datasets, and poor performance on complex tasks like… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.04359v1-abstract-full').style.display = 'inline'; document.getElementById('2501.04359v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.04359v1-abstract-full" style="display: none;"> Decoding speech from non-invasive brain signals, such as electroencephalography (EEG), has the potential to advance brain-computer interfaces (BCIs), with applications in silent communication and assistive technologies for individuals with speech impairments. However, EEG-based speech decoding faces major challenges, such as noisy data, limited datasets, and poor performance on complex tasks like speech perception. This study attempts to address these challenges by employing variational autoencoders (VAEs) for EEG data augmentation to improve data quality and applying a state-of-the-art (SOTA) sequence-to-sequence deep learning architecture, originally successful in electromyography (EMG) tasks, to EEG-based speech decoding. Additionally, we adapt this architecture for word classification tasks. Using the Brennan dataset, which contains EEG recordings of subjects listening to narrated speech, we preprocess the data and evaluate both classification and sequence-to-sequence models for EEG-to-words/sentences tasks. Our experiments show that VAEs have the potential to reconstruct artificial EEG data for augmentation. Meanwhile, our sequence-to-sequence model achieves more promising performance in generating sentences compared to our classification model, though both remain challenging tasks. These findings lay the groundwork for future research on EEG speech perception decoding, with possible extensions to speech production tasks such as silent or imagined speech. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.04359v1-abstract-full').style.display = 'none'; document.getElementById('2501.04359v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. 
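The VAE-based augmentation step described above amounts to training a variational autoencoder on EEG windows and then decoding Gaussian latent samples into synthetic trials. A schematic fully-connected VAE follows; the architecture, window size, and latent dimension are assumptions for illustration, not the paper's configuration:

```python
import torch
import torch.nn as nn

class EEGVAE(nn.Module):
    """Tiny fully-connected VAE over flattened EEG windows (schematic)."""
    def __init__(self, in_dim=64 * 128, z_dim=32):   # channels x timesteps
        super().__init__()
        self.enc = nn.Sequential(nn.Linear(in_dim, 256), nn.ReLU())
        self.mu, self.logvar = nn.Linear(256, z_dim), nn.Linear(256, z_dim)
        self.dec = nn.Sequential(nn.Linear(z_dim, 256), nn.ReLU(),
                                 nn.Linear(256, in_dim))

    def forward(self, x):
        h = self.enc(x)
        mu, logvar = self.mu(h), self.logvar(h)
        z = mu + torch.randn_like(mu) * (0.5 * logvar).exp()  # reparameterize
        return self.dec(z), mu, logvar

def vae_loss(x, recon, mu, logvar):
    rec = ((recon - x) ** 2).sum(dim=1).mean()                 # reconstruction
    kld = -0.5 * (1 + logvar - mu ** 2 - logvar.exp()).sum(dim=1).mean()
    return rec + kld

model = EEGVAE()
recon, mu, logvar = model(torch.randn(8, 64 * 128))   # one training step's loss
loss = vae_loss(torch.randn(8, 64 * 128), recon, mu, logvar)
# after training, synthetic EEG for augmentation is just decoded noise:
fake = model.dec(torch.randn(16, 32))                 # 16 artificial windows
```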
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">19 pages, 15 figures, 2 tables</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 68T07; 92C55 <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> H.5.2; I.2.6; J.3 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.03880">arXiv:2501.03880</a> <span> [<a href="https://arxiv.org/pdf/2501.03880">pdf</a>, <a href="https://arxiv.org/format/2501.03880">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> SELMA3D challenge: Self-supervised learning for 3D light-sheet microscopy image segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Chen%2C+Y">Ying Chen</a>, <a href="/search/eess?searchtype=author&query=Al-Maskari%2C+R">Rami Al-Maskari</a>, <a href="/search/eess?searchtype=author&query=Horvath%2C+I">Izabela Horvath</a>, <a href="/search/eess?searchtype=author&query=Ali%2C+M">Mayar Ali</a>, <a href="/search/eess?searchtype=author&query=Hoher%2C+L">Luciano Hoher</a>, <a href="/search/eess?searchtype=author&query=Yang%2C+K">Kaiyuan Yang</a>, <a href="/search/eess?searchtype=author&query=Lin%2C+Z">Zengming Lin</a>, <a href="/search/eess?searchtype=author&query=Zhai%2C+Z">Zhiwei Zhai</a>, <a href="/search/eess?searchtype=author&query=Shen%2C+M">Mengzhe Shen</a>, <a href="/search/eess?searchtype=author&query=Xun%2C+D">Dejin Xun</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+Y">Yi Wang</a>, <a href="/search/eess?searchtype=author&query=Xu%2C+T">Tony Xu</a>, <a href="/search/eess?searchtype=author&query=Goubran%2C+M">Maged Goubran</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+Y">Yunheng Wu</a>, <a href="/search/eess?searchtype=author&query=Mori%2C+K">Kensaku Mori</a>, <a href="/search/eess?searchtype=author&query=Paetzold%2C+J+C">Johannes C. Paetzold</a>, <a href="/search/eess?searchtype=author&query=Erturk%2C+A">Ali Erturk</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.03880v2-abstract-short" style="display: inline;"> Recent innovations in light sheet microscopy, paired with developments in tissue clearing techniques, enable the 3D imaging of large mammalian tissues with cellular resolution. Combined with the progress in large-scale data analysis, driven by deep learning, these innovations empower researchers to rapidly investigate the morphological and functional properties of diverse biological samples. 
Segme… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.03880v2-abstract-full').style.display = 'inline'; document.getElementById('2501.03880v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.03880v2-abstract-full" style="display: none;"> Recent innovations in light sheet microscopy, paired with developments in tissue clearing techniques, enable the 3D imaging of large mammalian tissues with cellular resolution. Combined with the progress in large-scale data analysis, driven by deep learning, these innovations empower researchers to rapidly investigate the morphological and functional properties of diverse biological samples. Segmentation, a crucial preliminary step in the analysis process, can be automated using domain-specific deep learning models with expert-level performance. However, these models exhibit high sensitivity to domain shifts, leading to a significant drop in accuracy when applied to data outside their training distribution. To address this limitation, and inspired by the recent success of self-supervised learning in training generalizable models, we organized the SELMA3D Challenge during the MICCAI 2024 conference. SELMA3D provides a vast collection of light-sheet images from cleared mice and human brains, comprising 35 large 3D images-each with over 1000^3 voxels-and 315 annotated small patches for finetuning, preliminary testing and final testing. The dataset encompasses diverse biological structures, including vessel-like and spot-like structures. Five teams participated in all phases of the challenge, and their proposed methods are reviewed in this paper. Quantitative and qualitative results from most participating teams demonstrate that self-supervised learning on large datasets improves segmentation model performance and generalization. We will continue to support and extend SELMA3D as an inaugural MICCAI challenge focused on self-supervised learning for 3D microscopy image segmentation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.03880v2-abstract-full').style.display = 'none'; document.getElementById('2501.03880v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 7 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">2nd version</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.03689">arXiv:2501.03689</a> <span> [<a href="https://arxiv.org/pdf/2501.03689">pdf</a>, <a href="https://arxiv.org/format/2501.03689">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> MAJL: A Model-Agnostic Joint Learning Framework for Music Source Separation and Pitch Estimation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Wei%2C+H">Haojie Wei</a>, <a href="/search/eess?searchtype=author&query=Yuan%2C+J">Jun Yuan</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+R">Rui Zhang</a>, <a href="/search/eess?searchtype=author&query=Dai%2C+Q">Quanyu Dai</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+Y">Yueguo Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.03689v1-abstract-short" style="display: inline;"> Music source separation and pitch estimation are two vital tasks in music information retrieval. Typically, the input of pitch estimation is obtained from the output of music source separation. Therefore, existing methods have tried to perform these two tasks simultaneously, so as to leverage the mutually beneficial relationship between both tasks. However, these methods still face two critical ch… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.03689v1-abstract-full').style.display = 'inline'; document.getElementById('2501.03689v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.03689v1-abstract-full" style="display: none;"> Music source separation and pitch estimation are two vital tasks in music information retrieval. Typically, the input of pitch estimation is obtained from the output of music source separation. Therefore, existing methods have tried to perform these two tasks simultaneously, so as to leverage the mutually beneficial relationship between both tasks. However, these methods still face two critical challenges that limit the improvement of both tasks: the lack of labeled data and joint learning optimization. To address these challenges, we propose a Model-Agnostic Joint Learning (MAJL) framework for both tasks. MAJL is a generic framework and can use variant models for each task. It includes a two-stage training method and a dynamic weighting method named Dynamic Weights on Hard Samples (DWHS), which address the lack of labeled data and joint learning optimization, respectively. Experimental results on public music datasets show that MAJL outperforms state-of-the-art methods on both tasks, with significant improvements of 0.92 in Signal-to-Distortion Ratio (SDR) for music source separation and 2.71% in Raw Pitch Accuracy (RPA) for pitch estimation.
Furthermore, comprehensive studies not only validate the effectiveness of each component of MAJL, but also indicate the great generality of MAJL in adapting to different model architectures. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.03689v1-abstract-full').style.display = 'none'; document.getElementById('2501.03689v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.03643">arXiv:2501.03643</a> <span> [<a href="https://arxiv.org/pdf/2501.03643">pdf</a>, <a href="https://arxiv.org/format/2501.03643">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Effective and Efficient Mixed Precision Quantization of Speech Foundation Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Xu%2C+H">Haoning Xu</a>, <a href="/search/eess?searchtype=author&query=Li%2C+Z">Zhaoqing Li</a>, <a href="/search/eess?searchtype=author&query=Jin%2C+Z">Zengrui Jin</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+H">Huimeng Wang</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+Y">Youjun Chen</a>, <a href="/search/eess?searchtype=author&query=Li%2C+G">Guinan Li</a>, <a href="/search/eess?searchtype=author&query=Geng%2C+M">Mengzhe Geng</a>, <a href="/search/eess?searchtype=author&query=Hu%2C+S">Shujie Hu</a>, <a href="/search/eess?searchtype=author&query=Deng%2C+J">Jiajun Deng</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+X">Xunying Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.03643v2-abstract-short" style="display: inline;"> This paper presents a novel mixed-precision quantization approach for speech foundation models that tightly integrates mixed-precision learning and quantized model parameter estimation into one single model compression stage. Experiments conducted on LibriSpeech dataset with fine-tuned wav2vec2.0-base and HuBERT-large models suggest the resulting mixed-precision quantized models increased the loss… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.03643v2-abstract-full').style.display = 'inline'; document.getElementById('2501.03643v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.03643v2-abstract-full" style="display: none;"> This paper presents a novel mixed-precision quantization approach for speech foundation models that tightly integrates mixed-precision learning and quantized model parameter estimation into one single model compression stage. 
Experiments conducted on LibriSpeech dataset with fine-tuned wav2vec2.0-base and HuBERT-large models suggest the resulting mixed-precision quantized models increased the lossless compression ratio by factors up to 1.7x and 1.9x over the respective uniform-precision and two-stage mixed-precision quantized baselines that perform precision learning and model parameter quantization in separate and disjoint stages, while incurring no statistically significant word error rate (WER) increase over the 32-bit full-precision models. The system compression time of wav2vec2.0-base and HuBERT-large models is reduced by up to 1.9 and 1.5 times over the two-stage mixed-precision baselines, while both produce lower WERs. The best-performing 3.5-bit mixed-precision quantized HuBERT-large model produces a lossless compression ratio of 8.6x over the 32-bit full-precision system. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.03643v2-abstract-full').style.display = 'none'; document.getElementById('2501.03643v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 7 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">To appear at IEEE ICASSP 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.03137">arXiv:2501.03137</a> <span> [<a href="https://arxiv.org/pdf/2501.03137">pdf</a>, <a href="https://arxiv.org/ps/2501.03137">ps</a>, <a href="https://arxiv.org/format/2501.03137">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Distributionally Robust Control Synthesis for Stochastic Systems with Safety and Reach-Avoid Specifications </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Chen%2C+Y">Yu Chen</a>, <a href="/search/eess?searchtype=author&query=Li%2C+Y">Yuda Li</a>, <a href="/search/eess?searchtype=author&query=Li%2C+S">Shaoyuan Li</a>, <a href="/search/eess?searchtype=author&query=Yin%2C+X">Xiang Yin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.03137v1-abstract-short" style="display: inline;"> We investigate the problem of synthesizing distributionally robust control policies for stochastic systems under safety and reach-avoid specifications. Using a game-theoretical framework, we consider the setting where the probability distribution of the disturbance at each time step is selected from an ambiguity set defined by the Wasserstein distance.
The goal is to synthesize a distributionally… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.03137v1-abstract-full').style.display = 'inline'; document.getElementById('2501.03137v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.03137v1-abstract-full" style="display: none;"> We investigate the problem of synthesizing distributionally robust control policies for stochastic systems under safety and reach-avoid specifications. Using a game-theoretical framework, we consider the setting where the probability distribution of the disturbance at each time step is selected from an ambiguity set defined by the Wasserstein distance. The goal is to synthesize a distributionally robust control policy that ensures the satisfaction probability exceeds a specified threshold under any distribution within the ambiguity set. First, for both safety and reach-avoid specifications, we establish the existence of optimal policies by leveraging the dynamic programming principles. Then we demonstrate how the associated optimization problem can be efficiently solved using the dual representation of Wasserstein distributionally robust optimization. Furthermore, for safety specifications in particular, we introduce a novel concept of distributionally robust control barrier certificates and show how these enable the efficient synthesis of controllers through sum-of-squares programming techniques. Finally, our experimental results reveal that incorporating distributional robustness during the synthesis phase significantly improves the satisfaction probability during online execution, even with limited statistical knowledge of the disturbance distribution. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.03137v1-abstract-full').style.display = 'none'; document.getElementById('2501.03137v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.03053">arXiv:2501.03053</a> <span> [<a href="https://arxiv.org/pdf/2501.03053">pdf</a>, <a href="https://arxiv.org/format/2501.03053">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Dr. 
Tongue: Sign-Oriented Multi-label Detection for Remote Tongue Diagnosis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Chen%2C+Y">Yiliang Chen</a>, <a href="/search/eess?searchtype=author&query=Ho%2C+S+S">Steven SC Ho</a>, <a href="/search/eess?searchtype=author&query=Xu%2C+C">Cheng Xu</a>, <a href="/search/eess?searchtype=author&query=Xie%2C+Y+J">Yao Jie Xie</a>, <a href="/search/eess?searchtype=author&query=Yeung%2C+W">Wing-Fai Yeung</a>, <a href="/search/eess?searchtype=author&query=He%2C+S">Shengfeng He</a>, <a href="/search/eess?searchtype=author&query=Qin%2C+J">Jing Qin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.03053v2-abstract-short" style="display: inline;"> Tongue diagnosis is a vital tool in Western and Traditional Chinese Medicine, providing key insights into a patient's health by analyzing tongue attributes. The COVID-19 pandemic has heightened the need for accurate remote medical assessments, emphasizing the importance of precise tongue attribute recognition via telehealth. To address this, we propose a Sign-Oriented multi-label Attributes Detect… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.03053v2-abstract-full').style.display = 'inline'; document.getElementById('2501.03053v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.03053v2-abstract-full" style="display: none;"> Tongue diagnosis is a vital tool in Western and Traditional Chinese Medicine, providing key insights into a patient's health by analyzing tongue attributes. The COVID-19 pandemic has heightened the need for accurate remote medical assessments, emphasizing the importance of precise tongue attribute recognition via telehealth. To address this, we propose a Sign-Oriented multi-label Attributes Detection framework. Our approach begins with an adaptive tongue feature extraction module that standardizes tongue images and mitigates environmental factors. This is followed by a Sign-oriented Network (SignNet) that identifies specific tongue attributes, emulating the diagnostic process of experienced practitioners and enabling comprehensive health evaluations. To validate our methodology, we developed an extensive tongue image dataset specifically designed for telemedicine. Unlike existing datasets, ours is tailored for remote diagnosis, with a comprehensive set of attribute labels. This dataset will be openly available, providing a valuable resource for research. Initial tests have shown improved accuracy in detecting various tongue attributes, highlighting our framework's potential as an essential tool for remote medical assessments. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.03053v2-abstract-full').style.display = 'none'; document.getElementById('2501.03053v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 6 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. 
arXiv:2501.02786 [pdf, other] cs.SD cs.CV eess.AS
CCStereo: Audio-Visual Contextual and Contrastive Learning for Binaural Audio Generation
Authors: Yuanhong Chen, Kazuki Shimada, Christian Simon, Yukara Ikemiya, Takashi Shibuya, Yuki Mitsufuji
Abstract: Binaural audio generation (BAG) aims to convert monaural audio to stereo audio using visual prompts, requiring a deep understanding of spatial and semantic information. However, current models risk overfitting to room environments and lose fine-grained spatial details. In this paper, we propose a new audio-visual binaural generation model with an audio-visual conditional normalisation layer that dynamically aligns the mean and variance of the target difference audio features using visual context, together with a new contrastive learning method that enhances spatial sensitivity by mining negative samples from shuffled visual features. We also introduce a cost-efficient way to utilise test-time augmentation on video data to enhance performance. Our approach achieves state-of-the-art generation accuracy on the FAIR-Play and MUSIC-Stereo benchmarks.
Submitted 6 January, 2025; originally announced January 2025.
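The "audio-visual conditional normalisation layer" described above normalises the difference-audio features and then re-scales and re-shifts them with statistics predicted from visual context. The following FiLM/AdaIN-style sketch is one plausible reading of that description, not the authors' layer; every dimension and name is an assumption.

```python
# FiLM/AdaIN-style visually conditioned normalisation (a hedged sketch of the
# idea in the abstract, not CCStereo's actual layer).
import torch
import torch.nn as nn

class VisualConditionalNorm(nn.Module):
    def __init__(self, audio_ch: int, visual_dim: int):
        super().__init__()
        self.norm = nn.InstanceNorm1d(audio_ch, affine=False)
        self.to_gamma = nn.Linear(visual_dim, audio_ch)   # predicts per-channel scale
        self.to_beta = nn.Linear(visual_dim, audio_ch)    # predicts per-channel shift

    def forward(self, audio: torch.Tensor, visual: torch.Tensor) -> torch.Tensor:
        # audio: (B, C, T) difference-audio features; visual: (B, D) context vector
        x = self.norm(audio)                              # zero mean, unit variance
        gamma = self.to_gamma(visual).unsqueeze(-1)       # (B, C, 1)
        beta = self.to_beta(visual).unsqueeze(-1)         # (B, C, 1)
        return gamma * x + beta                           # visually driven statistics

layer = VisualConditionalNorm(audio_ch=64, visual_dim=256)
out = layer(torch.randn(2, 64, 400), torch.randn(2, 256))  # (2, 64, 400)
```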
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.02786v1-abstract-full').style.display = 'none'; document.getElementById('2501.02786v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.02730">arXiv:2501.02730</a> <span> [<a href="https://arxiv.org/pdf/2501.02730">pdf</a>, <a href="https://arxiv.org/format/2501.02730">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> New Paradigm for Unified Near-Field and Far-Field Wireless Communications </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Wang%2C+Z">Zhaocheng Wang</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+H">Haochen Wu</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+Y">Yuanbin Chen</a>, <a href="/search/eess?searchtype=author&query=Lu%2C+L">Liyang Lu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.02730v1-abstract-short" style="display: inline;"> Current Type I and Type II codebooks in fifth generation (5G) wireless communications are limited in supporting the coexistence of far-field and near-field user equipments, as they are exclusively designed for far-field scenarios. To fill this knowledge gap and encourage relevant proposals by the 3rd Generation Partnership Project (3GPP), this article provides a novel codebook to facilitate a unif… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.02730v1-abstract-full').style.display = 'inline'; document.getElementById('2501.02730v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.02730v1-abstract-full" style="display: none;"> Current Type I and Type II codebooks in fifth generation (5G) wireless communications are limited in supporting the coexistence of far-field and near-field user equipments, as they are exclusively designed for far-field scenarios. To fill this knowledge gap and encourage relevant proposals by the 3rd Generation Partnership Project (3GPP), this article provides a novel codebook to facilitate a unified paradigm for the coexistence of far-field and near-field contexts. It ensures efficient precoding for all user equipments (UEs), while removing the need for the base station to identify whether one specific UE stays in either near-field or far-field regions. Additionally, our proposed codebook ensures compliance with current 3GPP standards for working flow and reference signals. Simulation results demonstrate the superior performance and versatility of our proposed codebook, validating its effectiveness in unifying near-field and far-field precoding for sixth-generation (6G) multiple-input multiple-output (MIMO) systems. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.02730v1-abstract-full').style.display = 'none'; document.getElementById('2501.02730v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This paper has been submitted to IEEE Network magazine, and is in revision</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.01384">arXiv:2501.01384</a> <span> [<a href="https://arxiv.org/pdf/2501.01384">pdf</a>, <a href="https://arxiv.org/format/2501.01384">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> OmniChat: Enhancing Spoken Dialogue Systems with Scalable Synthetic Data for Diverse Scenarios </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Cheng%2C+X">Xize Cheng</a>, <a href="/search/eess?searchtype=author&query=Fu%2C+D">Dongjie Fu</a>, <a href="/search/eess?searchtype=author&query=Yang%2C+X">Xiaoda Yang</a>, <a href="/search/eess?searchtype=author&query=Fang%2C+M">Minghui Fang</a>, <a href="/search/eess?searchtype=author&query=Hu%2C+R">Ruofan Hu</a>, <a href="/search/eess?searchtype=author&query=Lu%2C+J">Jingyu Lu</a>, <a href="/search/eess?searchtype=author&query=Jionghao%2C+B">Bai Jionghao</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+Z">Zehan Wang</a>, <a href="/search/eess?searchtype=author&query=Ji%2C+S">Shengpeng Ji</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+R">Rongjie Huang</a>, <a href="/search/eess?searchtype=author&query=Li%2C+L">Linjun Li</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+Y">Yu Chen</a>, <a href="/search/eess?searchtype=author&query=Jin%2C+T">Tao Jin</a>, <a href="/search/eess?searchtype=author&query=Zhao%2C+Z">Zhou Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.01384v1-abstract-short" style="display: inline;"> With the rapid development of large language models, researchers have created increasingly advanced spoken dialogue systems that can naturally converse with humans. 
However, these systems still struggle to handle the full complexity of real-world conversations, including audio events, musical contexts, and emotional expressions, mainly because current dialogue datasets are constrained in both scal… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.01384v1-abstract-full').style.display = 'inline'; document.getElementById('2501.01384v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.01384v1-abstract-full" style="display: none;"> With the rapid development of large language models, researchers have created increasingly advanced spoken dialogue systems that can naturally converse with humans. However, these systems still struggle to handle the full complexity of real-world conversations, including audio events, musical contexts, and emotional expressions, mainly because current dialogue datasets are constrained in both scale and scenario diversity. In this paper, we propose leveraging synthetic data to enhance the dialogue models across diverse scenarios. We introduce ShareChatX, the first comprehensive, large-scale dataset for spoken dialogue that spans diverse scenarios. Based on this dataset, we introduce OmniChat, a multi-turn dialogue system with a heterogeneous feature fusion module, designed to optimize feature selection in different dialogue contexts. In addition, we explored critical aspects of training dialogue systems using synthetic data. Through comprehensive experimentation, we determined the ideal balance between synthetic and real data, achieving state-of-the-art results on the real-world dialogue dataset DailyTalk. We also highlight the crucial importance of synthetic data in tackling diverse, complex dialogue scenarios, especially those involving audio and music. For more details, please visit our demo page at \url{https://sharechatx.github.io/}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.01384v1-abstract-full').style.display = 'none'; document.getElementById('2501.01384v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. 
arXiv:2412.20083 [pdf, other] cs.IT eess.SP
Achieving Full-Bandwidth Sensing Performance with Partial Bandwidth Allocation for ISAC
Authors: Zhiqiang Xiao, Zhiwen Zhou, Qianglong Dai, Yong Zeng, Fei Yang, Yan Chen
Abstract: This letter studies an uplink integrated sensing and communication (ISAC) system using discrete Fourier transform spread orthogonal frequency division multiplexing (DFT-s-OFDM) transmission. We try to answer the following fundamental question: with only a fraction of the bandwidth allocated to the user with a sensing task, can the same delay resolution and unambiguous range be achieved as if all bandwidth were allocated to it? We affirmatively answer the question by proposing a novel two-stage delay estimation (TSDE) method that exploits the following facts: without increasing the allocated bandwidth, higher delay resolution can be achieved via distributed subcarrier allocation compared to its collocated counterpart, while there is a trade-off between delay resolution and unambiguous range as the subcarrier decimation factor varies. The key idea of the proposed TSDE method is therefore to first perform coarse delay estimation with collocated subcarriers to achieve a large unambiguous range, and then use distributed subcarriers with an optimized decimation factor to enhance delay resolution while avoiding delay ambiguity. Our analysis shows that the proposed TSDE method can achieve the full-bandwidth delay resolution and unambiguous range using at most half of the full bandwidth, provided that the channel delay spread is less than half of the unambiguous range. Numerical results show the superiority of the proposed method over the conventional method with collocated subcarriers.
Submitted 28 December, 2024; originally announced December 2024.
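The resolution/ambiguity trade-off behind TSDE is easy to reproduce: M collocated subcarriers with spacing df give delay resolution 1/(M*df) and unambiguous delay 1/df, while decimating the same number of subcarriers by a factor D sharpens resolution to 1/(M*D*df) but shrinks the unambiguous delay to 1/(D*df). The toy below reconstructs the two-stage idea under simplifying assumptions (noiseless phases, invented numerology); it is not the paper's implementation.

```python
# Toy two-stage delay estimation: coarse (collocated) then fine (distributed).
import numpy as np

df, M, D = 120e3, 64, 8        # subcarrier spacing, subcarriers per stage, decimation
tau = 1.73e-6                  # true delay in seconds (illustrative)

def estimate(freqs, tau, grid_max):
    h = np.exp(-1j * 2 * np.pi * freqs * tau)              # noiseless channel phases
    taus = np.linspace(0.0, grid_max, 20001)               # delay search grid
    corr = np.abs(np.exp(1j * 2 * np.pi * np.outer(taus, freqs)) @ h)
    return taus[np.argmax(corr)]

coarse = estimate(df * np.arange(M), tau, 1 / df)          # stage 1: large unambiguous delay
fine = estimate(D * df * np.arange(M), tau, 1 / (D * df))  # stage 2: fine but ambiguous
k = np.round((coarse - fine) * D * df)                     # ambiguity interval index
print("coarse:", coarse, " fine:", fine, " resolved:", fine + k / (D * df), " true:", tau)
```

In this toy, the coarse estimate only needs to land within half of the fine stage's unambiguous delay 1/(D*df) for the rounding step to pick the correct interval, which mirrors the kind of condition the letter's analysis formalises.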
arXiv:2412.20032 [pdf, other] eess.SY
Online Low-Carbon Workload, Energy, and Temperature Management of Distributed Data Centers
Authors: Rui Xie, Yue Chen, Xi Weng
Abstract: Data centers have become one of the major energy consumers, making their low-carbon operation critical to achieving global carbon neutrality. Although distributed data centers have the potential to reduce costs and emissions through cooperation, they face challenges due to uncertainties. This paper proposes an online approach to co-optimize the workload, energy, and temperature strategies across distributed data centers, targeting minimal total cost, controlled carbon emissions, and adherence to operational constraints. The Lyapunov optimization technique is adopted to derive a parametric real-time strategy that accommodates uncertainties in workload demands, ambient temperature, electricity prices, and carbon intensities, without requiring prior knowledge of their distributions. A theoretical upper bound on the optimality gap is derived, based on which a linear programming problem is formulated to optimize the strategy parameters, enhancing performance while ensuring operational constraints. Case studies and method comparisons validate the proposed method's effectiveness in reducing costs and carbon emissions.
Submitted 28 December, 2024; originally announced December 2024.
Comments: Accepted by the 4th Energy Conversion and Economics Annual Forum.
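For readers unfamiliar with the technique: Lyapunov optimization turns a long-term constraint into a virtual queue and, at each slot, minimises an instantaneous "drift-plus-penalty" objective, which requires no knowledge of the underlying distributions. The toy below shows only that generic recipe with invented placeholder dynamics; it is not the paper's data-center model or its parameter-optimisation step.

```python
# Drift-plus-penalty toy: online control under a long-term (carbon) budget.
import numpy as np

rng = np.random.default_rng(1)
V, cap = 50.0, 0.4                       # cost/constraint trade-off; per-slot emission budget
actions = np.linspace(0.0, 1.0, 101)     # fraction of flexible workload served now
Q, total_cost = 0.0, 0.0

for t in range(10_000):
    price, carbon = rng.uniform(0.2, 1.0), rng.uniform(0.1, 1.0)  # unknown distributions
    cost = price * actions + 2.0 * (1.0 - actions)   # serve now vs. costly deferral
    emis = carbon * actions                          # emissions of each action
    a = actions[np.argmin(V * cost + Q * emis)]      # drift-plus-penalty decision
    Q = max(Q + carbon * a - cap, 0.0)               # virtual queue for the budget
    total_cost += price * a + 2.0 * (1.0 - a)

print(f"average cost {total_cost / 10_000:.3f}, final queue {Q:.1f}")
```

A bounded queue certifies that the time-average budget is met, and V trades cost optimality against queue backlog, the same structure a Lyapunov optimality-gap bound builds on.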
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.20032v1-abstract-full').style.display = 'none'; document.getElementById('2412.20032v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">accepted by The 4th Energy Conversion and Economics Annual Forum</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.19996">arXiv:2412.19996</a> <span> [<a href="https://arxiv.org/pdf/2412.19996">pdf</a>, <a href="https://arxiv.org/format/2412.19996">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Embodied AI-empowered Low Altitude Economy: Integrated Sensing, Communications, Computation, and Control (ISC3) </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Yang%2C+Y">Yaoqi Yang</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+Y">Yong Chen</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+J">Jiacheng Wang</a>, <a href="/search/eess?searchtype=author&query=Sun%2C+G">Geng Sun</a>, <a href="/search/eess?searchtype=author&query=Niyato%2C+D">Dusit Niyato</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.19996v1-abstract-short" style="display: inline;"> Low altitude economy (LAE) holds immense potential to drive urban development across various sectors. However, LAE also faces challenges in data collection and processing efficiency, flight control precision, and network performance. The challenges could be solved by realizing an integration of sensing, communications, computation, and control (ISC3) for LAE. In this regard, embodied artificial in… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.19996v1-abstract-full').style.display = 'inline'; document.getElementById('2412.19996v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.19996v1-abstract-full" style="display: none;"> Low altitude economy (LAE) holds immense potential to drive urban development across various sectors. However, LAE also faces challenges in data collection and processing efficiency, flight control precision, and network performance. The challenges could be solved by realizing an integration of sensing, communications, computation, and control (ISC3) for LAE. In this regard, embodied artificial intelligence (EAI), with its unique perception, planning, and decision-making capabilities, offers a promising solution to realize ISC3. Specifically, this paper investigates an application of EAI into ISC3 to support LAE, exploring potential research focuses, solutions, and case study. 
arXiv:2412.19026 [pdf, other] eess.IV cs.AI cs.CV
Modality-Projection Universal Model for Comprehensive Full-Body Medical Imaging Segmentation
Authors: Yixin Chen, Lin Gao, Yajuan Gao, Rui Wang, Jingge Lian, Xiangxi Meng, Yanhua Duan, Leiying Chai, Hongbin Han, Zhaoping Cheng, Zhaoheng Xie
Abstract: The integration of deep learning in medical imaging has shown great promise for enhancing diagnostic, therapeutic, and research outcomes. However, applying universal models across multiple modalities remains challenging due to the inherent variability in data characteristics. This study aims to introduce and evaluate a Modality Projection Universal Model (MPUM). MPUM employs a novel modality-projection strategy, which allows the model to dynamically adjust its parameters to optimize performance across different imaging modalities. The MPUM demonstrated superior accuracy in identifying anatomical structures, enabling precise quantification for improved clinical decision-making. It also identifies metabolic associations within the brain-body axis, advancing research on brain-body physiological correlations. Furthermore, MPUM's unique controller-based convolution layer enables visualization of saliency maps across all network layers, significantly enhancing the model's interpretability.
Submitted 25 December, 2024; originally announced December 2024.
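One plausible reading of the "controller-based convolution layer" is a hypernetwork-style convolution whose kernel is generated on the fly from a modality embedding, so a single universal model re-parameterises itself per imaging modality. The sketch below illustrates that reading; it is an assumption, not the released MPUM code, and all dimensions and names are invented.

```python
# Hypernetwork-style "controller" convolution (a hedged guess at the mechanism).
import torch
import torch.nn as nn
import torch.nn.functional as F

class ControllerConv2d(nn.Module):
    def __init__(self, in_ch: int, out_ch: int, k: int = 3, emb_dim: int = 16):
        super().__init__()
        self.shape = (out_ch, in_ch, k, k)
        self.controller = nn.Linear(emb_dim, out_ch * in_ch * k * k)

    def forward(self, x: torch.Tensor, modality_emb: torch.Tensor) -> torch.Tensor:
        w = self.controller(modality_emb).view(self.shape)   # modality-specific kernel
        return F.conv2d(x, w, padding=self.shape[-1] // 2)

layer = ControllerConv2d(in_ch=8, out_ch=8)
ct_emb, mri_emb = torch.randn(16), torch.randn(16)   # learned per-modality embeddings
x = torch.randn(1, 8, 64, 64)
y_ct, y_mri = layer(x, ct_emb), layer(x, mri_emb)    # same layer, two behaviours
```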
arXiv:2412.18619 [pdf, other] cs.CL cs.AI cs.CV cs.LG cs.MM eess.AS
Next Token Prediction Towards Multimodal Intelligence: A Comprehensive Survey
Authors: Liang Chen, Zekun Wang, Shuhuai Ren, Lei Li, Haozhe Zhao, Yunshui Li, Zefan Cai, Hongcheng Guo, Lei Zhang, Yizhe Xiong, Yichi Zhang, Ruoyu Wu, Qingxiu Dong, Ge Zhang, Jian Yang, Lingwei Meng, Shujie Hu, Yulong Chen, Junyang Lin, Shuai Bai, Andreas Vlachos, Xu Tan, Minjia Zhang, Wen Xiao, Aaron Yee, et al. (2 additional authors not shown)
Abstract: Building on the foundations of language modeling in natural language processing, Next Token Prediction (NTP) has evolved into a versatile training objective for machine learning tasks across various modalities, achieving considerable success. As Large Language Models (LLMs) have advanced to unify understanding and generation tasks within the textual modality, recent research has shown that tasks from different modalities can also be effectively encapsulated within the NTP framework, transforming multimodal information into tokens and predicting the next one given the context. This survey introduces a comprehensive taxonomy that unifies both understanding and generation within multimodal learning through the lens of NTP. The proposed taxonomy covers five key aspects: multimodal tokenization, MMNTP model architectures, unified task representation, datasets & evaluation, and open challenges. This new taxonomy aims to aid researchers in their exploration of multimodal intelligence. An associated GitHub repository collecting the latest papers and repos is available at https://github.com/LMM101/Awesome-Multimodal-Next-Token-Prediction.
Submitted 29 December, 2024; v1 submitted 16 December, 2024; originally announced December 2024.
Comments: 69 pages, 18 figures; repo at https://github.com/LMM101/Awesome-Multimodal-Next-Token-Prediction
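The abstraction the survey organizes around fits in a few lines: once images, audio, and text are mapped to a shared token space, every task reduces to cross-entropy on the next token. The snippet below states that objective with random placeholder tensors standing in for a tokenizer and a model.

```python
# Next-token prediction objective over (already tokenised) multimodal sequences.
import torch
import torch.nn.functional as F

vocab = 1000                                    # toy shared multimodal vocabulary
seq = torch.randint(0, vocab, (4, 32))          # tokenised sequences (any modality mix)
logits = torch.randn(4, 32, vocab)              # stand-in for a causal model's outputs

# predict token t+1 from positions up to t: shift targets left by one
loss = F.cross_entropy(logits[:, :-1].reshape(-1, vocab), seq[:, 1:].reshape(-1))
```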
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">69 papes, 18 figures, repo at https://github.com/LMM101/Awesome-Multimodal-Next-Token-Prediction</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.18342">arXiv:2412.18342</a> <span> [<a href="https://arxiv.org/pdf/2412.18342">pdf</a>, <a href="https://arxiv.org/format/2412.18342">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Mitigating Label Noise using Prompt-Based Hyperbolic Meta-Learning in Open-Set Domain Generalization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Peng%2C+K">Kunyu Peng</a>, <a href="/search/eess?searchtype=author&query=Wen%2C+D">Di Wen</a>, <a href="/search/eess?searchtype=author&query=Saquib%2C+S+M">Sarfraz M. Saquib</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+Y">Yufan Chen</a>, <a href="/search/eess?searchtype=author&query=Zheng%2C+J">Junwei Zheng</a>, <a href="/search/eess?searchtype=author&query=Schneider%2C+D">David Schneider</a>, <a href="/search/eess?searchtype=author&query=Yang%2C+K">Kailun Yang</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+J">Jiamin Wu</a>, <a href="/search/eess?searchtype=author&query=Roitberg%2C+A">Alina Roitberg</a>, <a href="/search/eess?searchtype=author&query=Stiefelhagen%2C+R">Rainer Stiefelhagen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.18342v1-abstract-short" style="display: inline;"> Open-Set Domain Generalization (OSDG) is a challenging task requiring models to accurately predict familiar categories while minimizing confidence for unknown categories to effectively reject them in unseen domains. While the OSDG field has seen considerable advancements, the impact of label noise--a common issue in real-world datasets--has been largely overlooked. Label noise can mislead model op… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.18342v1-abstract-full').style.display = 'inline'; document.getElementById('2412.18342v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.18342v1-abstract-full" style="display: none;"> Open-Set Domain Generalization (OSDG) is a challenging task requiring models to accurately predict familiar categories while minimizing confidence for unknown categories to effectively reject them in unseen domains. While the OSDG field has seen considerable advancements, the impact of label noise--a common issue in real-world datasets--has been largely overlooked. Label noise can mislead model optimization, thereby exacerbating the challenges of open-set recognition in novel domains. 
arXiv:2412.13917 [pdf, other] eess.AS cs.LG cs.SD eess.SP
Speech Watermarking with Discrete Intermediate Representations
Authors: Shengpeng Ji, Ziyue Jiang, Jialong Zuo, Minghui Fang, Yifu Chen, Tao Jin, Zhou Zhao
Abstract: Speech watermarking techniques can proactively mitigate the potential harmful consequences of instant voice cloning. These techniques insert signals into speech that are imperceptible to humans but detectable by algorithms. Previous approaches typically embed watermark messages into continuous space. Intuitively, however, embedding watermark information into a robust discrete latent space can significantly improve the robustness of watermarking systems. In this paper, we propose DiscreteWM, a novel speech watermarking framework that injects watermarks into the discrete intermediate representations of speech. Specifically, we map speech into a discrete latent space with a vector-quantized autoencoder and inject watermarks by changing the modular arithmetic relation of the discrete IDs. To ensure the imperceptibility of watermarks, we also propose a manipulator model that selects the candidate tokens for watermark embedding. Experimental results demonstrate that our framework achieves state-of-the-art robustness and imperceptibility simultaneously. Moreover, our flexible frame-wise approach can serve as an efficient solution for both voice cloning detection and information hiding. DiscreteWM can encode 1 to 150 bits of watermark information within a 1-second speech clip, demonstrating its encoding capacity. Audio samples are available at https://DiscreteWM.github.io/discrete_wm.
Submitted 18 December, 2024; originally announced December 2024.
Comments: Accepted by AAAI 2025.
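The phrase "changing the modular arithmetic relation of the discrete IDs" admits a simple toy illustration: hide one bit per window of VQ token IDs by nudging a single token so that the window's ID sum modulo 2 equals the bit, and detect by re-reading the parities. This illustrates the stated idea only; in the actual system a learned manipulator model chooses which token to alter, and the substitute comes from the codebook.

```python
# Toy parity watermark on discrete token IDs (illustrative, not DiscreteWM).
import numpy as np

def embed(ids, bits, win=8):
    ids = ids.copy()
    for w, b in enumerate(bits):
        s = slice(w * win, (w + 1) * win)
        if ids[s].sum() % 2 != b:
            ids[s.stop - 1] += 1     # stand-in for swapping to a codebook neighbour
    return ids

def detect(ids, n_bits, win=8):
    return [int(ids[w * win:(w + 1) * win].sum() % 2) for w in range(n_bits)]

rng = np.random.default_rng(0)
tokens = rng.integers(0, 1024, size=64)   # discrete speech IDs from a VQ autoencoder
bits = [1, 0, 1, 1, 0, 0, 1, 0]
marked = embed(tokens, bits)
assert detect(marked, len(bits)) == bits  # watermark reads back exactly
```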
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by AAAI 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.13387">arXiv:2412.13387</a> <span> [<a href="https://arxiv.org/pdf/2412.13387">pdf</a>, <a href="https://arxiv.org/format/2412.13387">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Deep Speech Synthesis from Multimodal Articulatory Representations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Wu%2C+P">Peter Wu</a>, <a href="/search/eess?searchtype=author&query=Yu%2C+B">Bohan Yu</a>, <a href="/search/eess?searchtype=author&query=Scheck%2C+K">Kevin Scheck</a>, <a href="/search/eess?searchtype=author&query=Black%2C+A+W">Alan W Black</a>, <a href="/search/eess?searchtype=author&query=Krishnapriyan%2C+A+S">Aditi S. Krishnapriyan</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+I+Y">Irene Y. Chen</a>, <a href="/search/eess?searchtype=author&query=Schultz%2C+T">Tanja Schultz</a>, <a href="/search/eess?searchtype=author&query=Watanabe%2C+S">Shinji Watanabe</a>, <a href="/search/eess?searchtype=author&query=Anumanchipalli%2C+G+K">Gopala K. Anumanchipalli</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.13387v1-abstract-short" style="display: inline;"> The amount of articulatory data available for training deep learning models is much less compared to acoustic speech data. In order to improve articulatory-to-acoustic synthesis performance in these low-resource settings, we propose a multimodal pre-training framework. On single-speaker speech synthesis tasks from real-time magnetic resonance imaging and surface electromyography inputs, the intell… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.13387v1-abstract-full').style.display = 'inline'; document.getElementById('2412.13387v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.13387v1-abstract-full" style="display: none;"> The amount of articulatory data available for training deep learning models is much less compared to acoustic speech data. In order to improve articulatory-to-acoustic synthesis performance in these low-resource settings, we propose a multimodal pre-training framework. On single-speaker speech synthesis tasks from real-time magnetic resonance imaging and surface electromyography inputs, the intelligibility of synthesized outputs improves noticeably. For example, compared to prior work, utilizing our proposed transfer learning methods improves the MRI-to-speech performance by 36% word error rate. In addition to these intelligibility results, our multimodal pre-trained models consistently outperform unimodal baselines on three objective and subjective synthesis quality metrics. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.13387v1-abstract-full').style.display = 'none'; document.getElementById('2412.13387v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.12009">arXiv:2412.12009</a> <span> [<a href="https://arxiv.org/pdf/2412.12009">pdf</a>, <a href="https://arxiv.org/format/2412.12009">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> SpeechPrune: Context-aware Token Pruning for Speech Information Retrieval </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Lin%2C+Y">Yueqian Lin</a>, <a href="/search/eess?searchtype=author&query=Fu%2C+Y">Yuzhe Fu</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+J">Jingyang Zhang</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+Y">Yudong Liu</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+J">Jianyi Zhang</a>, <a href="/search/eess?searchtype=author&query=Sun%2C+J">Jingwei Sun</a>, <a href="/search/eess?searchtype=author&query=Li%2C+H+%22">Hai "Helen" Li</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+Y">Yiran Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.12009v1-abstract-short" style="display: inline;"> We introduce Speech Information Retrieval (SIR), a new long-context task for Speech Large Language Models (Speech LLMs), and present SPIRAL, a 1,012-sample benchmark testing models' ability to extract critical details from approximately 90-second spoken inputs. While current Speech LLMs excel at short-form tasks, they struggle with the computational and representational demands of longer audio seq… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.12009v1-abstract-full').style.display = 'inline'; document.getElementById('2412.12009v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.12009v1-abstract-full" style="display: none;"> We introduce Speech Information Retrieval (SIR), a new long-context task for Speech Large Language Models (Speech LLMs), and present SPIRAL, a 1,012-sample benchmark testing models' ability to extract critical details from approximately 90-second spoken inputs. While current Speech LLMs excel at short-form tasks, they struggle with the computational and representational demands of longer audio sequences. 
arXiv:2412.11551 [pdf, other] cs.SD cs.AI eess.AS
Region-Based Optimization in Continual Learning for Audio Deepfake Detection
Authors: Yujie Chen, Jiangyan Yi, Cunhang Fan, Jianhua Tao, Yong Ren, Siding Zeng, Chu Yuan Zhang, Xinrui Yan, Hao Gu, Jun Xue, Chenglong Wang, Zhao Lv, Xiaohui Zhang
Abstract: Rapid advancements in speech synthesis and voice conversion bring convenience but also new security risks, creating an urgent need for effective audio deepfake detection. Although current models perform well, their effectiveness diminishes when confronted with the diverse and evolving nature of real-world deepfakes. To address this issue, we propose a continual learning method named Region-Based Optimization (RegO) for audio deepfake detection. Specifically, we use the Fisher information matrix to measure neuron regions that are important for real and fake audio detection, dividing them into four regions. First, we directly fine-tune the less important regions to adapt quickly to new tasks. Next, we apply gradient optimization in parallel directions for regions important only to real audio detection, and in orthogonal directions for regions important only to fake audio detection. For regions important to both, we use sample-proportion-based adaptive gradient optimization. This region-adaptive optimization ensures an appropriate trade-off between memory stability and learning plasticity. Additionally, to address the accumulation of redundant neurons from old tasks, we further introduce an Ebbinghaus forgetting mechanism to release them, thereby promoting the model's ability to learn more generalized discriminative features. Experimental results show our method achieves a 21.3% improvement in EER over RWM, the state-of-the-art continual learning approach for audio deepfake detection. Moreover, the effectiveness of RegO extends beyond audio deepfake detection, showing potential significance for other tasks, such as image recognition. The code is available at https://github.com/cyjie429/RegO.
Submitted 16 December, 2024; originally announced December 2024.
Comments: Accepted by AAAI 2025.
href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>