Search | arXiv e-print repository
<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 772 results for author: <span class="mathjax">Zhang, S</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> <div class="content"> <form method="GET" action="/search/eess" aria-role="search"> Searching in archive <strong>eess</strong>. <a href="/search/?searchtype=author&query=Zhang%2C+S">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Zhang, S"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Zhang%2C+S&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Zhang, S"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Zhang%2C+S&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Zhang%2C+S&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhang%2C+S&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhang%2C+S&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhang%2C+S&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhang%2C+S&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.14534">arXiv:2502.14534</a> <span> [<a href="https://arxiv.org/pdf/2502.14534">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Poststroke rehabilitative mechanisms in individualized fatigue level-controlled treadmill training -- a Rat Model Study </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Xu%2C+Y">Yuchen Xu</a>, <a href="/search/eess?searchtype=author&query=Peng%2C+Y">Yulong Peng</a>, <a href="/search/eess?searchtype=author&query=Yao%2C+Y">Yuanfa Yao</a>, <a href="/search/eess?searchtype=author&query=Fan%2C+X">Xiaoman Fan</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+M">Minmin Wang</a>, <a href="/search/eess?searchtype=author&query=Gao%2C+F">Feng Gao</a>, <a href="/search/eess?searchtype=author&query=Sawan%2C+M">Mohamad Sawan</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+S">Shaomin Zhang</a>, <a href="/search/eess?searchtype=author&query=Hu%2C+X">Xiaoling Hu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.14534v1-abstract-short" style="display: inline;"> Individualized training improved post-stroke motor function rehabilitation efficiency. However, the mechanisms of how individualized training facilitates recovery is not clear. This study explored the cortical and corticomuscular rehabilitative effects in post-stroke motor function recovery during individualized training. 
Abstract: Individualized training improves post-stroke motor function rehabilitation efficiency. However, the mechanisms by which individualized training facilitates recovery are not clear. This study explored the cortical and corticomuscular rehabilitative effects in post-stroke motor function recovery during individualized training. Sprague-Dawley rats with intracerebral hemorrhage (ICH) were randomly distributed into two groups, forced training (FOR-T, n=13) and individualized fatigue-controlled training (FAT-C, n=13), which received training from day 2 to day 14 post-stroke. The FAT-C group exhibited superior motor function recovery and less central fatigue compared to the FOR-T group. EEG PSD slope analysis demonstrated better inter-hemispheric balance in the FAT-C group compared to the FOR-T group. The directed corticomuscular coherence (dCMC) analysis indicated that training-induced fatigue led to a short-term down-regulation of descending dCMC and an up-regulation of ascending dCMC. In the long term, excessive fatigue hindered the recovery of descending control in the affected hemisphere. The individualized strategy of peripheral fatigue-controlled training achieved better motor function recovery, which could be attributed to the mitigation of central fatigue, optimization of inter-hemispheric balance, and enhancement of descending control in the affected hemisphere.
Submitted 20 February, 2025; originally announced February 2025.
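The PSD-slope measure cited in this abstract can be illustrated with a short, self-contained sketch: estimate a channel's power spectral density with Welch's method and fit a line to log-power versus log-frequency. This is a generic illustration, not the authors' pipeline; the sampling rate, frequency band, and synthetic channels below are assumptions.

```python
# Minimal sketch of a PSD-slope estimate (assumed approach, not the paper's code):
# fit a line to log10(power) vs. log10(frequency) over a band of interest.
import numpy as np
from scipy.signal import welch

def psd_slope(eeg_channel, fs, fmin=1.0, fmax=40.0):
    """Return the slope of log10(PSD) vs. log10(freq) for one EEG channel."""
    freqs, psd = welch(eeg_channel, fs=fs, nperseg=int(4 * fs))
    band = (freqs >= fmin) & (freqs <= fmax)
    slope, _ = np.polyfit(np.log10(freqs[band]), np.log10(psd[band]), 1)
    return slope

# Synthetic stand-ins for left/right hemisphere channels; comparing the two
# slopes is one simple way to quantify inter-hemispheric balance.
fs = 250
left = np.random.randn(10 * fs)
right = np.random.randn(10 * fs)
print(psd_slope(left, fs), psd_slope(right, fs))
```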
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.12686">arXiv:2502.12686</a> <span> [<a href="https://arxiv.org/pdf/2502.12686">pdf</a>, <a href="https://arxiv.org/format/2502.12686">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> RadSplatter: Extending 3D Gaussian Splatting to Radio Frequencies for Wireless Radiomap Extrapolation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Wang%2C+Y">Yiheng Wang</a>, <a href="/search/eess?searchtype=author&query=Xue%2C+Y">Ye Xue</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+S">Shutao Zhang</a>, <a href="/search/eess?searchtype=author&query=Chang%2C+T">Tsung-Hui Chang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.12686v1-abstract-short" style="display: inline;"> A radiomap represents the spatial distribution of wireless signal strength, critical for applications like network optimization and autonomous driving. However, constructing radiomap relies on measuring radio signal power across the entire system, which is costly in outdoor environments due to large network scales. We present RadSplatter, a framework that extends 3D Gaussian Splatting (3DGS) to ra… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.12686v1-abstract-full').style.display = 'inline'; document.getElementById('2502.12686v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.12686v1-abstract-full" style="display: none;"> A radiomap represents the spatial distribution of wireless signal strength, critical for applications like network optimization and autonomous driving. However, constructing radiomap relies on measuring radio signal power across the entire system, which is costly in outdoor environments due to large network scales. We present RadSplatter, a framework that extends 3D Gaussian Splatting (3DGS) to radio frequencies for efficient and accurate radiomap extrapolation from sparse measurements. RadSplatter models environmental scatterers and radio paths using 3D Gaussians, capturing key factors of radio wave propagation. It employs a relaxed-mean (RM) scheme to reparameterize the positions of 3D Gaussians from noisy and dense 3D point clouds. A camera-free 3DGS-based projection is proposed to map 3D Gaussians onto 2D radio beam patterns. Furthermore, a regularized loss function and recursive fine-tuning using highly structured sparse measurements in real-world settings are applied to ensure robust generalization. Experiments on synthetic and real-world data show state-of-the-art extrapolation accuracy and execution speed. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.12686v1-abstract-full').style.display = 'none'; document.getElementById('2502.12686v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.11484">arXiv:2502.11484</a> <span> [<a href="https://arxiv.org/pdf/2502.11484">pdf</a>, <a href="https://arxiv.org/format/2502.11484">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Dictionary-Learning-Based Data Pruning for System Identification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Wang%2C+T">Tingna Wang</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+S">Sikai Zhang</a>, <a href="/search/eess?searchtype=author&query=Sun%2C+L">Limin Sun</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.11484v1-abstract-short" style="display: inline;"> System identification is normally involved in augmenting time series data by time shifting and nonlinearisation (via polynomial basis), which introduce redundancy both feature-wise and sample-wise. Many research works focus on reducing redundancy feature-wise, while less attention is paid to sample-wise redundancy. This paper proposes a novel data pruning method, called (mini-batch) FastCan, to re… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11484v1-abstract-full').style.display = 'inline'; document.getElementById('2502.11484v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.11484v1-abstract-full" style="display: none;"> System identification is normally involved in augmenting time series data by time shifting and nonlinearisation (via polynomial basis), which introduce redundancy both feature-wise and sample-wise. Many research works focus on reducing redundancy feature-wise, while less attention is paid to sample-wise redundancy. This paper proposes a novel data pruning method, called (mini-batch) FastCan, to reduce sample-wise redundancy based on dictionary learning. Time series data is represented by some representative samples, called atoms, via dictionary learning. The useful samples are selected based on their correlation with the atoms. The method is tested on one simulated dataset and two benchmark datasets. The R-squared between the coefficients of models trained on the full and the coefficients of models trained on pruned datasets is adopted to evaluate the performance of data pruning methods. It is found that the proposed method significantly outperforms the random pruning method. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11484v1-abstract-full').style.display = 'none'; document.getElementById('2502.11484v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. 
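The pruning idea described here (represent the series by learned atoms, then keep the samples that correlate best with them) can be sketched generically. The snippet below is not the FastCan algorithm; the dictionary learner, the selection rule, and the sizes are assumptions made for illustration.

```python
# Rough sketch of correlation-based sample pruning with learned atoms.
# Illustrates the general idea only; not the FastCan algorithm.
import numpy as np
from sklearn.decomposition import MiniBatchDictionaryLearning

def prune_samples(X, n_atoms=8, keep=100, seed=0):
    """Keep the `keep` rows of X most correlated with any learned atom."""
    dl = MiniBatchDictionaryLearning(n_components=n_atoms, random_state=seed)
    dl.fit(X)                                   # learned atoms live in dl.components_
    Xn = X / (np.linalg.norm(X, axis=1, keepdims=True) + 1e-12)
    An = dl.components_ / (np.linalg.norm(dl.components_, axis=1, keepdims=True) + 1e-12)
    corr = np.abs(Xn @ An.T).max(axis=1)        # best |cosine similarity| with any atom
    return np.argsort(corr)[::-1][:keep]        # indices of retained samples

rng = np.random.default_rng(0)
X = rng.standard_normal((1000, 20))             # synthetic augmented regressor matrix
X_pruned = X[prune_samples(X, keep=200)]
```

A model fitted on X_pruned could then be compared against one fitted on the full X, e.g. via the R-squared between their coefficient vectors, as the abstract describes.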
4. arXiv:2502.09283
Title: Rate-Splitting Multiple Access for 6G: Prototypes, Experimental Results and Link/System level Simulations
Authors: Sundar Aditya, Yong Jin Daniel Kim, David Vargas, David Redgate, Onur Dizdar, Neil Bhushan, Xinze Lyu, Sibo Zhang, Stephen Wang, Bruno Clerckx
Subjects: Signal Processing (eess.SP)
Abstract: Rate-Splitting Multiple Access (RSMA) is a powerful and versatile physical layer multiple access technique that generalizes and has better interference management capabilities than 5G-based Space Division Multiple Access (SDMA). It is also a rapidly maturing technology, all of which makes it a natural successor to SDMA in 6G. In this article, we describe RSMA's suitability for 6G by presenting: (i) link and system level simulations of RSMA's performance gains over SDMA in realistic environments, and (ii) pioneering experimental results that demonstrate RSMA's gains over SDMA for key use cases like enhanced Mobile Broadband (eMBB) and Integrated Sensing and Communications (ISAC). We also comment on the status of standardization activities for RSMA.
Submitted 17 February, 2025; v1 submitted 13 February, 2025; originally announced February 2025.
Comments: Submitted to the IEEE Communications Standards Magazine December 2025 Special Issue on "Wireless Technologies for 6G and Beyond: Applications, Implementations, and Standardization"
5. arXiv:2502.06171
Title: A Data-Efficient Pan-Tumor Foundation Model for Oncology CT Interpretation
Authors: Wenhui Lei, Hanyu Chen, Zitian Zhang, Luyang Luo, Qiong Xiao, Yannian Gu, Peng Gao, Yankai Jiang, Ci Wang, Guangtao Wu, Tongjia Xu, Yingjie Zhang, Xiaofan Zhang, Pranav Rajpurkar, Shaoting Zhang, Zhenning Wang
Subjects: Image and Video Processing (eess.IV); Computer Vision and Pattern Recognition (cs.CV)
Abstract: Artificial intelligence-assisted imaging analysis has made substantial strides in tumor diagnosis and management. Here we present PASTA, a pan-tumor CT foundation model that achieves state-of-the-art performance on 45 of 46 representative oncology tasks -- including lesion segmentation, tumor detection in plain CT, tumor staging, survival prediction, structured report generation, and cross-modality transfer learning -- significantly outperforming the second-best models on 35 tasks. This remarkable advancement is driven by our development of PASTA-Gen, an innovative synthetic tumor generation framework that produces a comprehensive dataset of 30,000 CT scans with pixel-level annotated lesions and paired structured reports, encompassing malignancies across ten organs and five benign lesion types. By leveraging this rich, high-quality synthetic data, we overcome a longstanding bottleneck in the development of CT foundation models -- specifically, the scarcity of publicly available, high-quality annotated datasets due to privacy constraints and the substantial labor required for scaling precise data annotation. Encouragingly, PASTA demonstrates exceptional data efficiency with promising practical value, markedly improving performance on various tasks with only a small amount of real-world data. The open release of both the synthetic dataset and the PASTA foundation model effectively addresses the challenge of data scarcity, thereby advancing oncological research and clinical translation.
Submitted 10 February, 2025; originally announced February 2025.
Comments: 57 pages, 7 figures
6. arXiv:2502.04369
Title: HSI: A Holistic Style Injector for Arbitrary Style Transfer
Authors: Shuhao Zhang, Hui Kang, Yang Liu, Fang Mei, Hongjuan Li
Subjects: Computer Vision and Pattern Recognition (cs.CV); Machine Learning (cs.LG); Image and Video Processing (eess.IV)
Abstract: Attention-based arbitrary style transfer methods have gained significant attention recently due to their impressive ability to synthesize style details. However, the point-wise matching within the attention mechanism may overly focus on local patterns and thus neglect the remarkable global features of style images. Additionally, when processing large images, the quadratic complexity of the attention mechanism brings a high computational load. To alleviate the above problems, we propose the Holistic Style Injector (HSI), a novel attention-style transformation module to deliver artistic expression of the target style. Specifically, HSI performs stylization based only on a global style representation that is more in line with the characteristics of style transfer, to avoid generating local disharmonious patterns in stylized images. Moreover, we propose a dual relation learning mechanism inside the HSI to dynamically render images by leveraging semantic similarity in content and style, ensuring the stylized images preserve the original content and improve style fidelity. Note that the proposed HSI achieves linear computational complexity because it establishes feature mapping through element-wise multiplication rather than matrix multiplication. Qualitative and quantitative results demonstrate that our method outperforms state-of-the-art approaches in both effectiveness and efficiency.
Submitted 5 February, 2025; originally announced February 2025.
7. arXiv:2502.03736
Title: Decoding Human Attentive States from Spatial-temporal EEG Patches Using Transformers
Authors: Yi Ding, Joon Hei Lee, Shuailei Zhang, Tianze Luo, Cuntai Guan
Subjects: Signal Processing (eess.SP)
Abstract: Learning the spatial topology of electroencephalogram (EEG) channels and their temporal dynamics is crucial for decoding attention states. This paper introduces EEG-PatchFormer, a transformer-based deep learning framework designed specifically for EEG attention classification in Brain-Computer Interface (BCI) applications. By integrating a Temporal CNN for frequency-based EEG feature extraction, a pointwise CNN for feature enhancement, and Spatial and Temporal Patching modules for organizing features into spatial-temporal patches, EEG-PatchFormer jointly learns spatial-temporal information from EEG data. Leveraging the global learning capabilities of the self-attention mechanism, it captures essential features across brain regions over time, thereby enhancing EEG data decoding performance. Demonstrating superior performance, EEG-PatchFormer surpasses existing benchmarks in accuracy, area under the ROC curve (AUC), and macro-F1 score on a public cognitive attention dataset. The code can be found at: https://github.com/yi-ding-cs/EEG-PatchFormer
Submitted 7 February, 2025; v1 submitted 5 February, 2025; originally announced February 2025.
Comments: Implementation details are updated

8. arXiv:2502.02295
Title: Intelligent Reflecting Surface Based Localization of Mixed Near-Field and Far-Field Targets
Authors: Weifeng Zhu, Qipeng Wang, Shuowen Zhang, Boya Di, Liang Liu, Yonina C. Eldar
Subjects: Signal Processing (eess.SP); Information Theory (cs.IT)
Abstract: This paper considers an intelligent reflecting surface (IRS)-assisted bi-static localization architecture for the sixth-generation (6G) integrated sensing and communication (ISAC) network. The system consists of a transmit user, a receive base station (BS), an IRS, and multiple targets in either the far-field or near-field region of the IRS. In particular, we focus on the challenging scenario where the line-of-sight (LOS) paths between targets and the BS are blocked, such that the emitted orthogonal frequency division multiplexing (OFDM) signals from the user reach the BS merely via the user-target-IRS-BS path. Based on the signals received by the BS, our goal is to localize the targets by estimating their relative positions to the IRS, instead of to the BS. We show that subspace-based methods, such as the multiple signal classification (MUSIC) algorithm, can be applied to the BS's received signals to estimate the relative states from the targets to the IRS. To this end, we create a virtual signal via combining user-target-IRS-BS channels over various time slots. By applying MUSIC on such a virtual signal, we are able to detect the far-field targets and the near-field targets, and estimate the angle-of-arrivals (AOAs) and/or ranges from the targets to the IRS. Furthermore, we theoretically verify that the proposed method can perfectly estimate the relative states from the targets to the IRS in the ideal case with infinite coherence blocks. Numerical results verify the effectiveness of our proposed IRS-assisted localization scheme. Our paper demonstrates the potential of employing passive anchors, i.e., IRSs, to improve the sensing coverage of the active anchors, i.e., BSs.
Submitted 4 February, 2025; originally announced February 2025.
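The MUSIC step named in this abstract is the classical subspace method; a minimal sketch for angle-of-arrival estimation with a uniform linear array (half-wavelength spacing, synthetic snapshots) is shown below. It does not reproduce the paper's virtual-signal construction or IRS geometry; the array size, noise level, and target angles are assumed.

```python
# Textbook MUSIC angle-of-arrival estimation for a uniform linear array
# (illustrative only; not the paper's IRS-specific virtual-signal method).
import numpy as np
from scipy.signal import find_peaks

def steering(theta_deg, n_elem):
    k = np.arange(n_elem)
    return np.exp(1j * np.pi * k * np.sin(np.deg2rad(theta_deg)))  # spacing = lambda/2

def music_spectrum(Y, n_src, grid_deg):
    R = Y @ Y.conj().T / Y.shape[1]              # sample covariance
    _, vecs = np.linalg.eigh(R)                  # eigenvalues in ascending order
    En = vecs[:, : Y.shape[0] - n_src]           # noise subspace
    spec = []
    for th in grid_deg:
        a = steering(th, Y.shape[0])
        spec.append(1.0 / np.real(a.conj() @ En @ En.conj().T @ a))
    return np.array(spec)

rng = np.random.default_rng(1)
n_elem, n_snap, angles = 16, 200, [-20.0, 35.0]  # assumed geometry and sources
A = np.stack([steering(a, n_elem) for a in angles], axis=1)
S = (rng.standard_normal((2, n_snap)) + 1j * rng.standard_normal((2, n_snap))) / np.sqrt(2)
N = 0.1 * (rng.standard_normal((n_elem, n_snap)) + 1j * rng.standard_normal((n_elem, n_snap)))
Y = A @ S + N

grid = np.linspace(-90, 90, 721)
spec = music_spectrum(Y, n_src=2, grid_deg=grid)
peaks, _ = find_peaks(spec)
top2 = peaks[np.argsort(spec[peaks])[-2:]]
print(np.sort(grid[top2]))                       # peaks should sit near -20 and 35 degrees
```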
9. arXiv:2502.00319
Title: Physics-Inspired Distributed Radio Map Estimation
Authors: Dong Yang, Yue Wang, Songyang Zhang, Yingshu Li, Zhipeng Cai
Subjects: Machine Learning (cs.LG); Distributed, Parallel, and Cluster Computing (cs.DC); Signal Processing (eess.SP)
Abstract: To gain panoramic awareness of spectrum coverage in complex wireless environments, data-driven learning approaches have recently been introduced for radio map estimation (RME). While existing deep learning based methods conduct RME given spectrum measurements gathered from dispersed sensors in the region of interest, they rely on centralized data at a fusion center, which raises critical concerns about data privacy leakage and high communication overhead. Federated learning (FL) enhances data security and communication efficiency in RME by allowing multiple clients to collaborate in model training without directly sharing local data. However, the performance of FL-based RME can be hindered by task heterogeneity across clients due to their unavailable or inaccurate landscaping information. To fill this gap, in this paper we propose a physics-inspired distributed RME solution in the absence of landscaping information. The main idea is to develop a novel distributed RME framework empowered by the domain knowledge of radio propagation models, and to design a new distributed learning approach that splits the entire RME model into two modules. A global autoencoder module is shared among clients to capture the common pathloss influence on the radio propagation pattern, while a client-specific autoencoder module focuses on learning the individual features produced by local shadowing effects from the unique building distributions in the local environment. Simulation results show that our proposed method outperforms the benchmarks.
Submitted 31 January, 2025; originally announced February 2025.
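The split described here, a shared global module averaged across clients plus a client-specific module kept local, resembles partial-model federated averaging. The sketch below illustrates only that pattern; the module shapes, the single SGD step per round, and the plain parameter averaging are assumptions, not the paper's architecture or training procedure.

```python
# Schematic sketch of averaging only a shared "global" module across clients
# while keeping each client's local module private (assumed setup for illustration).
import copy
import torch
import torch.nn as nn

class ClientRME(nn.Module):
    def __init__(self):
        super().__init__()
        # shared part: common pathloss-like structure
        self.global_ae = nn.Sequential(nn.Linear(64, 16), nn.ReLU(), nn.Linear(16, 64))
        # private part: local shadowing effects
        self.local_ae = nn.Sequential(nn.Linear(64, 16), nn.ReLU(), nn.Linear(16, 64))

    def forward(self, x):
        return self.global_ae(x) + self.local_ae(x)

def federated_round(clients, data):
    # 1) one local training step on each client's own measurements
    for model, x in zip(clients, data):
        opt = torch.optim.SGD(model.parameters(), lr=0.01)
        loss = nn.functional.mse_loss(model(x), x)
        opt.zero_grad()
        loss.backward()
        opt.step()
    # 2) the server averages ONLY the shared global module
    avg = copy.deepcopy(clients[0].global_ae.state_dict())
    for key in avg:
        avg[key] = torch.stack([c.global_ae.state_dict()[key] for c in clients]).mean(0)
    for c in clients:
        c.global_ae.load_state_dict(avg)

clients = [ClientRME() for _ in range(3)]
data = [torch.randn(32, 64) for _ in range(3)]   # synthetic per-client measurements
federated_round(clients, data)
```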
10. arXiv:2502.00295
Title: Toward noise-robust whisper keyword spotting on headphones with in-earcup microphone and curriculum learning
Authors: Qiaoyu Yang, Shuo Zhang, Chuan-Che Huang
Subjects: Audio and Speech Processing (eess.AS); Sound (cs.SD)
Abstract: The expanding feature set of modern headphones puts a challenge on the design of their control interface. Users may want to separately control each feature or quickly switch between modes that activate different features. The traditional approach of physical buttons may no longer be feasible when the feature set is large. Keyword spotting with voice commands is a promising solution to this issue. Most existing methods of keyword spotting only support commands spoken in a regular voice. However, a regular voice may not be desirable in quiet places or public settings. In this paper, we investigate the problem of on-device keyword spotting in whisper voice and explore approaches to improve noise robustness. We leverage the inner microphone on noise-cancellation headphones as an additional source of voice input. We also design a curriculum learning strategy that gradually increases the proportion of whisper keywords during training. We demonstrate through experiments that the combination of multi-microphone processing and curriculum learning can improve the F1 score of whisper keyword spotting by up to 15% in noisy conditions.
Submitted 31 January, 2025; originally announced February 2025.
Comments: Accepted to ICASSP 2025
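The curriculum strategy summarized above (gradually raising the share of whisper keywords during training) can be expressed as a simple sampling schedule. The sketch below is a generic illustration with assumed ramp length, proportions, and batch size, not the authors' recipe.

```python
# Generic curriculum-sampling sketch: the fraction of whisper-voice keyword
# examples in each batch grows linearly over the first epochs
# (ramp length and proportions are assumed for illustration).
import random

def whisper_fraction(epoch, ramp_epochs=10, start=0.1, end=0.7):
    t = min(epoch / ramp_epochs, 1.0)
    return start + t * (end - start)

def sample_batch(regular_pool, whisper_pool, batch_size, epoch):
    n_whisper = round(whisper_fraction(epoch) * batch_size)
    batch = random.sample(whisper_pool, n_whisper)
    batch += random.sample(regular_pool, batch_size - n_whisper)
    random.shuffle(batch)
    return batch

regular_pool = [f"reg_{i}" for i in range(1000)]   # stand-ins for utterance IDs
whisper_pool = [f"whi_{i}" for i in range(1000)]
for epoch in range(12):
    batch = sample_batch(regular_pool, whisper_pool, batch_size=32, epoch=epoch)
```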
11. arXiv:2501.17644
Title: Efficient Stochastic Polar Decoder With Correlated Stochastic Computing
Authors: Jiaxing Li, Shuwen Zhang, Zhisong Bie
Subjects: Information Theory (cs.IT); Signal Processing (eess.SP)
Abstract: Polar codes have gained significant attention in channel coding for their ability to approach the capacity of binary input discrete memoryless channels (B-DMCs), thanks to their reliability and efficiency in transmission. However, existing decoders often struggle to balance hardware area and performance. Stochastic computing offers a way to simplify circuits, and previous work has implemented decoding using this approach. A common issue with these methods is performance degradation caused by the introduction of correlation. This paper presents an Efficient Correlated Stochastic Polar Decoder (ECS-PD) that fundamentally addresses the issue of the 'hold-state', preventing it from increasing as correlation computation progresses. We propose two optimization strategies aimed at reducing iteration latency, increasing throughput, and simplifying the circuit to improve hardware efficiency. The optimization can reduce the number of iterations by 25.2% at $E_b/N_0$ = 3 dB. Compared to other efficient designs, the proposed ECS-PD achieves higher throughput and is 2.7 times more hardware-efficient than the min-sum decoder.
Submitted 29 January, 2025; originally announced January 2025.
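For context on the correlation problem this abstract refers to, recall the textbook stochastic-computing encoding: a value in [0, 1] becomes a random bit-stream with that mean, multiplication is a bitwise AND of two streams, and the result is only correct when the streams are independent. The toy sketch below demonstrates that effect; it is unrelated to the ECS-PD hardware design, and the stream length and probabilities are assumed.

```python
# Toy illustration of stochastic computing: AND of two INDEPENDENT bit-streams
# approximates the product of their encoded values; correlated streams do not.
import numpy as np

rng = np.random.default_rng(0)
N = 100_000                      # stream length (assumed)
p, q = 0.6, 0.3

sp = rng.random(N) < p           # independent encodings of p and q
sq = rng.random(N) < q
print((sp & sq).mean())          # close to p * q = 0.18

u = rng.random(N)                # same underlying randomness for both streams
sp_c = u < p
sq_c = u < q                     # maximally correlated with sp_c
print((sp_c & sq_c).mean())      # ~min(p, q) = 0.3, not p * q
```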
href="/search/eess?searchtype=author&query=Huo%2C+Y">Yuqi Huo</a>, <a href="/search/eess?searchtype=author&query=Liang%2C+Z">Zheng Liang</a> , et al. (68 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.15368v1-abstract-short" style="display: inline;"> We introduce Baichuan-Omni-1.5, an omni-modal model that not only has omni-modal understanding capabilities but also provides end-to-end audio generation capabilities. To achieve fluent and high-quality interaction across modalities without compromising the capabilities of any modality, we prioritized optimizing three key aspects. First, we establish a comprehensive data cleaning and synthesis pip… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.15368v1-abstract-full').style.display = 'inline'; document.getElementById('2501.15368v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.15368v1-abstract-full" style="display: none;"> We introduce Baichuan-Omni-1.5, an omni-modal model that not only has omni-modal understanding capabilities but also provides end-to-end audio generation capabilities. To achieve fluent and high-quality interaction across modalities without compromising the capabilities of any modality, we prioritized optimizing three key aspects. First, we establish a comprehensive data cleaning and synthesis pipeline for multimodal data, obtaining about 500B high-quality data (text, audio, and vision). Second, an audio-tokenizer (Baichuan-Audio-Tokenizer) has been designed to capture both semantic and acoustic information from audio, enabling seamless integration and enhanced compatibility with MLLM. Lastly, we designed a multi-stage training strategy that progressively integrates multimodal alignment and multitask fine-tuning, ensuring effective synergy across all modalities. Baichuan-Omni-1.5 leads contemporary models (including GPT4o-mini and MiniCPM-o 2.6) in terms of comprehensive omni-modal capabilities. Notably, it achieves results comparable to leading models such as Qwen2-VL-72B across various multimodal medical benchmarks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.15368v1-abstract-full').style.display = 'none'; document.getElementById('2501.15368v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. 
arXiv:2501.14526 (https://arxiv.org/abs/2501.14526) [cs.RO, eess.SY]
Robustified Time-optimal Point-to-point Motion Planning and Control under Uncertainty
Authors: Shuhao Zhang, Jan Swevers
Abstract: This paper proposes a novel approach to formulate time-optimal point-to-point motion planning and control under uncertainty. The approach defines a robustified two-stage Optimal Control Problem (OCP), in which stage 1, with a fixed time grid, is seamlessly stitched with stage 2, which features a variable time grid. Stage 1 optimizes not only the nominal trajectory, but also feedback gains and corresponding state covariances, which robustify constraints in both stages. The outcome is a minimized uncertainty in stage 1 and a minimized total motion time for stage 2, both contributing to the time optimality and safety of the total motion. A timely replanning strategy is employed to handle changes in constraints and maintain feasibility, while a tailored iterative algorithm is proposed for efficient, real-time OCP execution.
Submitted 24 January, 2025; originally announced January 2025.
arXiv:2501.12501 (https://arxiv.org/abs/2501.12501) [eess.AS, cs.SD]
A Domain Adaptation Framework for Speech Recognition Systems with Only Synthetic data
Authors: Minh Tran, Yutong Pang, Debjyoti Paul, Laxmi Pandey, Kevin Jiang, Jinxi Guo, Ke Li, Shun Zhang, Xuedong Zhang, Xin Lei
Abstract: We introduce DAS (Domain Adaptation with Synthetic data), a novel domain adaptation framework for pre-trained ASR models, designed to efficiently adapt to various language-defined domains without requiring any real data. In particular, DAS first prompts large language models (LLMs) to generate domain-specific texts before converting these texts to speech via text-to-speech technology. The synthetic data is used to fine-tune Whisper with Low-Rank Adapters (LoRAs) for targeted domains such as music, weather, and sports. We introduce a novel one-pass decoding strategy that merges predictions from multiple LoRA adapters efficiently during the auto-regressive text generation process. Experimental results show significant improvements, reducing the Word Error Rate (WER) by 10% to 17% across all target domains compared to the original model, with minimal performance regression in out-of-domain settings (e.g., -1% on Librispeech test sets).
We also demonstrate that DAS operates efficiently during inference, introducing only an additional 9% increase in Real Time Factor (RTF) compared to the original model when inferring with three LoRA adapters.
Submitted 21 January, 2025; originally announced January 2025.
Comments: ICASSP 2025.

arXiv:2501.10654 (https://arxiv.org/abs/2501.10654) [eess.SP]
Efficient Transmission of Radiomaps via Physics-Enhanced Semantic Communications
Authors: Yueling Zhou, Achintha Wijesinghe, Yue Wang, Songyang Zhang, Zhipeng Cai
Abstract: Enriching information of spectrum coverage, radiomap plays an important role in many wireless communication applications, such as resource allocation and network optimization. To enable real-time, distributed spectrum management, particularly in the scenarios with unstable and dynamic environments, the efficient transmission of spectrum coverage information for radiomaps from edge devices to the central server emerges as a critical problem. In this work, we propose an innovative physics-enhanced semantic communication framework tailored for efficient radiomap transmission based on generative learning models.
Specifically, instead of bit-wise message passing, we only transmit the key "semantics" in radiomaps characterized by the radio propagation behavior and surrounding environments, where semantic compression schemes are utilized to reduce the communication overhead. Incorporating the novel concepts of Radio Depth Maps, the radiomaps are reconstructed from the delivered semantic information backboned on the conditional generative adversarial networks. Our framework is further extended to facilitate its implementation in the scenarios of multi-user edge computing, by integrating with federated learning for collaborative model training while preserving the data privacy. Experimental results show that our approach achieves high accuracy in radio coverage information recovery at ultra-high bandwidth efficiency, which has great potentials in many wireless-generated data transmission applications.
Submitted 17 January, 2025; originally announced January 2025.
Comments: To appear in 2025 IEEE International Conference on Communications.

arXiv:2501.10408 (https://arxiv.org/abs/2501.10408) [eess.AS, cs.CL, cs.SD]
Leveraging Cross-Attention Transformer and Multi-Feature Fusion for Cross-Linguistic Speech Emotion Recognition
Authors: Ruoyu Zhao, Xiantao Jiang, F. Richard Yu, Victor C. M. Leung, Tao Wang, Shaohu Zhang
Abstract: Speech Emotion Recognition (SER) plays a crucial role in enhancing human-computer interaction. Cross-Linguistic SER (CLSER) has been a challenging research problem due to significant variability in linguistic and acoustic features of different languages. In this study, we propose a novel approach HuMP-CAT, which combines HuBERT, MFCC, and prosodic characteristics.
These features are fused using a cross-attention transformer (CAT) mechanism during feature extraction. Transfer learning is applied to gain from a source emotional speech dataset to the target corpus for emotion recognition. We use IEMOCAP as the source dataset to train the source model and evaluate the proposed method on seven datasets in five languages (e.g., English, German, Spanish, Italian, and Chinese). We show that, by fine-tuning the source model with a small portion of speech from the target datasets, HuMP-CAT achieves an average accuracy of 78.75% across the seven datasets, with notable performance of 88.69% on EMODB (German language) and 79.48% on EMOVO (Italian language). Our extensive evaluation demonstrates that HuMP-CAT outperforms existing methods across multiple target languages.
Submitted 6 January, 2025; originally announced January 2025.
arXiv:2501.10402 (https://arxiv.org/abs/2501.10402) [eess.SP, cs.SD, eess.AS]
SSM2Mel: State Space Model to Reconstruct Mel Spectrogram from the EEG
Authors: Cunhang Fan, Sheng Zhang, Jingjing Zhang, Zexu Pan, Zhao Lv
Abstract: Decoding speech from brain signals is a challenging research problem that holds significant importance for studying speech processing in the brain. Although breakthroughs have been made in reconstructing the mel spectrograms of audio stimuli perceived by subjects at the word or letter level using noninvasive electroencephalography (EEG), there is still a critical gap in precisely reconstructing continuous speech features, especially at the minute level. To address this issue, this paper proposes a State Space Model (SSM) to reconstruct the mel spectrogram of continuous speech from EEG, named SSM2Mel. This model introduces a novel Mamba module to effectively model the long sequence of EEG signals for imagined speech. In the SSM2Mel model, the S4-UNet structure is used to enhance the extraction of local features of EEG signals, and the Embedding Strength Modulator (ESM) module is used to incorporate subject-specific information. Experimental results show that our model achieves a Pearson correlation of 0.069 on the SparrKULee dataset, which is a 38% improvement over the previous baseline.
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.10402v1-abstract-full').style.display = 'none'; document.getElementById('2501.10402v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ICASSP 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.10392">arXiv:2501.10392</a> <span> [<a href="https://arxiv.org/pdf/2501.10392">pdf</a>, <a href="https://arxiv.org/format/2501.10392">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Emerging Technologies">cs.ET</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Ion Transmitter for Molecular Communication </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhang%2C+S">Shaojie Zhang</a>, <a href="/search/eess?searchtype=author&query=Akan%2C+O+B">Ozgur B. Akan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.10392v1-abstract-short" style="display: inline;"> Molecular communication (MC) is an emerging paradigm that takes inspiration from biological processes, enabling communication at the nanoscale and facilitating the development of the Internet of Bio-Nano Things (IoBNT). Traditional models of MC often rely on idealized assumptions that overlook practical challenges related to noise and signal behavior. This paper proposes and evaluates the first ph… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.10392v1-abstract-full').style.display = 'inline'; document.getElementById('2501.10392v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.10392v1-abstract-full" style="display: none;"> Molecular communication (MC) is an emerging paradigm that takes inspiration from biological processes, enabling communication at the nanoscale and facilitating the development of the Internet of Bio-Nano Things (IoBNT). Traditional models of MC often rely on idealized assumptions that overlook practical challenges related to noise and signal behavior. This paper proposes and evaluates the first physical MC ion transmitter (ITX) using an ion exchange membrane. The circuit network model is used to simulate ion transport and analyze both transient and steady-state behavior. This analysis includes the effects of noise sources such as thermal and shot noise on signal integrity and SNR. The main contributions of this paper are to demonstrate how a practical MC ITX can produce a realistic waveform and to highlight future research challenges associated with a physical membrane-based ITX. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.10392v1-abstract-full').style.display = 'none'; document.getElementById('2501.10392v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages, 10 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.06449">arXiv:2501.06449</a> <span> [<a href="https://arxiv.org/pdf/2501.06449">pdf</a>, <a href="https://arxiv.org/ps/2501.06449">ps</a>, <a href="https://arxiv.org/format/2501.06449">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Target Detection in ISAC Systems with Active RISs: A Multi-Perspective Observation Approach </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhang%2C+S">Shoushuo Zhang</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+R">Rang Liu</a>, <a href="/search/eess?searchtype=author&query=Li%2C+M">Ming Li</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+W">Wei Wang</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+Q">Qian Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.06449v1-abstract-short" style="display: inline;"> Integrated sensing and communication (ISAC) has emerged as a transformative technology for 6G networks, enabling the seamless integration of communication and sensing functionalities. Reconfigurable intelligent surfaces (RIS), with their capability to adaptively reconfigure the radio environment, have shown significant potential in enhancing communication quality and enabling advanced cooperative… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.06449v1-abstract-full').style.display = 'inline'; document.getElementById('2501.06449v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.06449v1-abstract-full" style="display: none;"> Integrated sensing and communication (ISAC) has emerged as a transformative technology for 6G networks, enabling the seamless integration of communication and sensing functionalities. Reconfigurable intelligent surfaces (RIS), with their capability to adaptively reconfigure the radio environment, have shown significant potential in enhancing communication quality and enabling advanced cooperative sensing. This paper investigates a multi-RIS-assisted ISAC system and introduces a novel multi-perspective observation framework that leverages the diversity of multiple observation paths, each exhibiting distinct spatial, delay, and Doppler characteristics for both target and clutter. 
The proposed framework integrates symbol-level precoding (SLP) and space-time adaptive processing (STAP) to fully exploit the benefits of multi-perspective observations, enabling superior target-clutter separation and significantly improving detection accuracy. The objective is to jointly design the transmit waveform, reflection coefficients of multiple active RISs, and spatial-temporal receive filters to maximize the radar output signal-to-clutter-plus-noise ratio (SCNR) for target detection, while ensuring the quality-of-service (QoS) requirements of communication users. To address the resulting non-convex optimization problem, an effective iterative algorithm is developed, combining fractional programming (FP), majorization-minimization (MM), and the alternating direction method of multipliers (ADMM). Extensive simulation results validate the effectiveness of the proposed multi-perspective observation strategy, demonstrating its advantages in improving target detection performance in challenging environments.
Submitted 11 January, 2025; originally announced January 2025.
Comments: Submitted to TCCN.

arXiv:2501.06394 (https://arxiv.org/abs/2501.06394) [cs.SD, cs.AI, eess.AS]
UniSpeaker: A Unified Approach for Multimodality-driven Speaker Generation
Authors: Zhengyan Sheng, Zhihao Du, Heng Lu, Shiliang Zhang, Zhen-Hua Ling
Abstract: Recent advancements in personalized speech generation have brought synthetic speech increasingly close to the realism of target speakers' recordings, yet multimodal speaker generation remains on the rise. This paper introduces UniSpeaker, a unified approach for multimodality-driven speaker generation.
Specifically, we propose a unified voice aggregator based on KV-Former, applying soft contrastive loss to map diverse voice description modalities into a shared voice space, ensuring that the generated voice aligns more closely with the input descriptions. To evaluate multimodality-driven voice control, we build the first multimodality-based voice control (MVC) benchmark, focusing on voice suitability, voice diversity, and speech quality. UniSpeaker is evaluated across five tasks using the MVC benchmark, and the experimental results demonstrate that UniSpeaker outperforms previous modality-specific models. Speech samples are available at https://UniSpeaker.github.io.
Submitted 10 January, 2025; originally announced January 2025.
arXiv:2501.06282 (https://arxiv.org/abs/2501.06282) [cs.CL, cs.AI, cs.HC, cs.SD, eess.AS]
MinMo: A Multimodal Large Language Model for Seamless Voice Interaction
Authors: Qian Chen, Yafeng Chen, Yanni Chen, Mengzhe Chen, Yingda Chen, Chong Deng, Zhihao Du, Ruize Gao, Changfeng Gao, Zhifu Gao, Yabin Li, Xiang Lv, Jiaqing Liu, Haoneng Luo, Bin Ma, Chongjia Ni, Xian Shi, Jialong Tang, Hui Wang, Hao Wang, Wen Wang, Yuxuan Wang, Yunlan Xu, Fan Yu, Zhijie Yan, et al. (11 additional authors not shown)
Abstract: Recent advancements in large language models (LLMs) and multimodal speech-text models have laid the groundwork for seamless voice interactions, enabling real-time, natural, and human-like conversations. Previous models for voice interactions are categorized as native and aligned.
Native models integrate speech and text processing in one framework but struggle with issues like differing sequence lengths and insufficient pre-training. Aligned models maintain text LLM capabilities but are often limited by small datasets and a narrow focus on speech tasks. In this work, we introduce MinMo, a Multimodal Large Language Model with approximately 8B parameters for seamless voice interaction. We address the main limitations of prior aligned multimodal models. We train MinMo through multiple stages of speech-to-text alignment, text-to-speech alignment, speech-to-speech alignment, and duplex interaction alignment, on 1.4 million hours of diverse speech data and a broad range of speech tasks. After the multi-stage training, MinMo achieves state-of-the-art performance across various benchmarks for voice comprehension and generation while maintaining the capabilities of text LLMs, and also facilitates full-duplex conversation, that is, simultaneous two-way communication between the user and the system. Moreover, we propose a novel and simple voice decoder that outperforms prior models in voice generation. The enhanced instruction-following capabilities of MinMo support controlling speech generation based on user instructions, with various nuances including emotions, dialects, and speaking rates, and mimicking specific voices. For MinMo, the speech-to-text latency is approximately 100 ms, while the full-duplex latency is approximately 600 ms in theory and 800 ms in practice. The MinMo project web page is https://funaudiollm.github.io/minmo, and the code and models will be released soon.
Submitted 10 January, 2025; originally announced January 2025.
Comments: Work in progress.
Authors are listed in alphabetical order by family name.

arXiv:2501.06276 (https://arxiv.org/abs/2501.06276) [cs.SD, cs.CL, eess.AS]
PROEMO: Prompt-Driven Text-to-Speech Synthesis Based on Emotion and Intensity Control
Authors: Shaozuo Zhang, Ambuj Mehrish, Yingting Li, Soujanya Poria
Abstract: Speech synthesis has significantly advanced from statistical methods to deep neural network architectures, leading to various text-to-speech (TTS) models that closely mimic human speech patterns. However, capturing nuances such as emotion and style in speech synthesis is challenging. To address this challenge, we introduce an approach centered on prompt-based emotion control. The proposed architecture incorporates emotion and intensity control across multi-speakers. Furthermore, we leverage large language models (LLMs) to manipulate speech prosody while preserving linguistic content. Using embedding emotional cues, regulating intensity levels, and guiding prosodic variations with prompts, our approach infuses synthesized speech with human-like expressiveness and variability. Lastly, we demonstrate the effectiveness of our approach through a systematic exploration of the control mechanisms mentioned above.
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.06276v1-abstract-full').style.display = 'none'; document.getElementById('2501.06276v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.06051">arXiv:2501.06051</a> <span> [<a href="https://arxiv.org/pdf/2501.06051">pdf</a>, <a href="https://arxiv.org/ps/2501.06051">ps</a>, <a href="https://arxiv.org/format/2501.06051">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Benchmarking Rotary Position Embeddings for Automatic Speech Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhang%2C+S">Shucong Zhang</a>, <a href="/search/eess?searchtype=author&query=Parcollet%2C+T">Titouan Parcollet</a>, <a href="/search/eess?searchtype=author&query=van+Dalen%2C+R">Rogier van Dalen</a>, <a href="/search/eess?searchtype=author&query=Bhattacharya%2C+S">Sourav Bhattacharya</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.06051v1-abstract-short" style="display: inline;"> Rotary Position Embedding (RoPE) encodes relative and absolute positional information in Transformer-based models through rotation matrices applied to input vectors within sequences. While RoPE has demonstrated superior performance compared to other positional embedding technologies in natural language processing tasks, its effectiveness in speech processing applications remains understudied. In t… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.06051v1-abstract-full').style.display = 'inline'; document.getElementById('2501.06051v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.06051v1-abstract-full" style="display: none;"> Rotary Position Embedding (RoPE) encodes relative and absolute positional information in Transformer-based models through rotation matrices applied to input vectors within sequences. While RoPE has demonstrated superior performance compared to other positional embedding technologies in natural language processing tasks, its effectiveness in speech processing applications remains understudied. In this work, we conduct a comprehensive evaluation of RoPE across diverse automatic speech recognition (ASR) tasks. Our experimental results demonstrate that for ASR tasks, RoPE consistently achieves lower error rates compared to the currently widely used relative positional embedding. To facilitate further research, we release the implementation and all experimental recipes through the SpeechBrain toolkit. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.06051v1-abstract-full').style.display = 'none'; document.getElementById('2501.06051v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.03793">arXiv:2501.03793</a> <span> [<a href="https://arxiv.org/pdf/2501.03793">pdf</a>, <a href="https://arxiv.org/format/2501.03793">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> STAR-RIS Aided Dynamic Scatterers Tracking for Integrated Sensing and Communications </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Li%2C+M">Muye Li</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+S">Shun Zhang</a>, <a href="/search/eess?searchtype=author&query=Ge%2C+Y">Yao Ge</a>, <a href="/search/eess?searchtype=author&query=Yuen%2C+C">Chau Yuen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.03793v1-abstract-short" style="display: inline;"> Integrated sensing and communication (ISAC) has become an attractive technology for future wireless networks. In this paper, we propose a simultaneous transmission and reflection reconfigurable intelligent surface (STAR-RIS) aided dynamic scatterers tracking scheme for ISAC in high mobility millimeter wave communication systems, where the STAR-RIS is employed to provide communication service for i… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.03793v1-abstract-full').style.display = 'inline'; document.getElementById('2501.03793v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.03793v1-abstract-full" style="display: none;"> Integrated sensing and communication (ISAC) has become an attractive technology for future wireless networks. In this paper, we propose a simultaneous transmission and reflection reconfigurable intelligent surface (STAR-RIS) aided dynamic scatterers tracking scheme for ISAC in high mobility millimeter wave communication systems, where the STAR-RIS is employed to provide communication service for indoor user with the base station (BS) and simultaneously sense and track the interested outdoor dynamic scatterers. Specifically, we resort to an active STAR-RIS to respectively receive and further deal with the impinging signal from its double sides at the same time. Then, we develop a transmission strategy with the activation scheme of the STAR-RIS elements, and construct the signal models within the system. After acquiring the channel parameters related to the BS-RIS channel, the dynamic paths can be identified from all the scattering paths, and the dynamic targets can be classified with respect to their radar cross sections. We further track the outdoor scatterers at STAR-RIS by resorting to the Gaussian mixture-probability hypothesis density filter. 
With the tracked locations of the outdoor scatterers, a beam prediction strategy for both the precoder of BS and the refraction phase shift vector of STAR-RIS is developed to enhance the communication performance of the indoor user. Besides, a target mismatch detection and path collision prediction mechanism is proposed to reduce the training overhead and improve the transmission performance. Finally, the feasibility and effectiveness of our proposed STAR-RIS aided dynamic scatterers tracking scheme for ISAC are demonstrated and verified via simulation results.
Submitted 7 January, 2025; originally announced January 2025.
Comments: 14 pages, 14 figures.

arXiv:2501.02992 (https://arxiv.org/abs/2501.02992) [eess.IV, cs.AI, cs.CV]
GLFC: Unified Global-Local Feature and Contrast Learning with Mamba-Enhanced UNet for Synthetic CT Generation from CBCT
Authors: Xianhao Zhou, Jianghao Wu, Huangxuan Zhao, Lei Chen, Shaoting Zhang, Guotai Wang
Abstract: Generating synthetic Computed Tomography (CT) images from Cone Beam Computed Tomography (CBCT) is desirable for improving the image quality of CBCT. Existing synthetic CT (sCT) generation methods using Convolutional Neural Networks (CNN) and Transformers often face difficulties in effectively capturing both global and local features and contrasts for high-quality sCT generation.
In this work, we propose a Global-Local Feature and Contrast learning (GLFC) framework for sCT generation. First, a Mamba-Enhanced UNet (MEUNet) is introduced by integrating Mamba blocks into the skip connections of a high-resolution UNet for effective global and local feature learning. Second, we propose a Multiple Contrast Loss (MCL) that calculates synthetic loss at different intensity windows to improve quality for both soft tissues and bone regions. Experiments on the SynthRAD2023 dataset demonstrate that GLFC improved the SSIM of sCT from 77.91% to 91.50% compared with the original CBCT, and significantly outperformed several existing methods for sCT generation. The code is available at https://github.com/HiLab-git/GLFC
Submitted 11 January, 2025; v1 submitted 6 January, 2025; originally announced January 2025.
Comments: Accepted by ISBI2025.

arXiv:2501.02756 (https://arxiv.org/abs/2501.02756) [eess.SP]
Channel Modeling and Rate Analysis of Optical Inter-Satellite Link (OISL)
Authors: Bodong Shang, Shuo Zhang, Zi Jing Wong
Abstract: Optical inter-satellite links (OISLs) improve connectivity between satellites in space. They offer advantages such as high-throughput data transfer and reduced size, weight, and power requirements compared to traditional radio frequency transmission.
However, the channel model and communication performance for long-distance inter-satellite laser transmission still require in-depth study. In this paper, we first develop a channel model for OISL communication within non-terrestrial networks (NTN) by accounting for pointing errors caused by satellite jitter and tracking noise. We derive the distributions of the channel state arising from these pointing errors and calculate their average value. Additionally, we determine the average achievable data rate for OISL communication in NTN and design a cooperative OISL system, highlighting a trade-off between concentrating beam energy and balancing misalignment. We calculate the minimum number of satellites required in cooperative OISLs to achieve a targeted data transmission size while adhering to latency constraints. This involves exploring the balance between the increased data rate of each link and the cumulative latency across all links. Finally, simulation results validate the effectiveness of the proposed analytical model and provide insights into the optimal number of satellites needed for cooperative OISLs and the optimal laser frequency to use.
Submitted 5 January, 2025; originally announced January 2025.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">6 pages, 5 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.02572">arXiv:2501.02572</a> <span> [<a href="https://arxiv.org/pdf/2501.02572">pdf</a>, <a href="https://arxiv.org/format/2501.02572">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Energy Optimization of Multi-task DNN Inference in MEC-assisted XR Devices: A Lyapunov-Guided Reinforcement Learning Approach </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Sun%2C+Y">Yanzan Sun</a>, <a href="/search/eess?searchtype=author&query=Qiu%2C+J">Jiacheng Qiu</a>, <a href="/search/eess?searchtype=author&query=Pan%2C+G">Guangjin Pan</a>, <a href="/search/eess?searchtype=author&query=Xu%2C+S">Shugong Xu</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+S">Shunqing Zhang</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+X">Xiaoyun Wang</a>, <a href="/search/eess?searchtype=author&query=Han%2C+S">Shuangfeng Han</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.02572v1-abstract-short" style="display: inline;"> Extended reality (XR), blending virtual and real worlds, is a key application of future networks. While AI advancements enhance XR capabilities, they also impose significant computational and energy challenges on lightweight XR devices. In this paper, we developed a distributed queue model for multi-task DNN inference, addressing issues of resource competition and queue coupling. In response to th… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.02572v1-abstract-full').style.display = 'inline'; document.getElementById('2501.02572v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.02572v1-abstract-full" style="display: none;"> Extended reality (XR), blending virtual and real worlds, is a key application of future networks. While AI advancements enhance XR capabilities, they also impose significant computational and energy challenges on lightweight XR devices. In this paper, we developed a distributed queue model for multi-task DNN inference, addressing issues of resource competition and queue coupling. In response to the challenges posed by the high energy consumption and limited resources of XR devices, we designed a dual time-scale joint optimization strategy for model partitioning and resource allocation, formulated as a bi-level optimization problem. This strategy aims to minimize the total energy consumption of XR devices while ensuring queue stability and adhering to computational and communication resource constraints. To tackle this problem, we devised a Lyapunov-guided Proximal Policy Optimization algorithm, named LyaPPO. 
Numerical results demonstrate that the LyaPPO algorithm outperforms the baselines, achieving energy conservation of 24.79% to 46.14% under varying resource capacities. Specifically, the proposed algorithm reduces the energy consumption of XR devices by 24.29% to 56.62% compared to baseline algorithms.
Submitted 5 January, 2025; originally announced January 2025.
Comments: 13 pages, 7 figures. This work has been submitted to the IEEE for possible publication
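For readers unfamiliar with Lyapunov-guided optimization, the per-slot quantity such methods trade off is the drift-plus-penalty term: a weight V times the instantaneous energy plus the queue-weighted backlog change. The sketch below shows that bookkeeping on toy queues; the arrival, service, and energy numbers are made up, and a real LyaPPO-style agent would choose actions that influence them.

```python
import numpy as np

def drift_plus_penalty(queues, arrivals, services, energy, V=50.0):
    """Per-slot Lyapunov drift-plus-penalty value and updated queue backlogs."""
    # penalty term (energy) weighted by V, plus queue-weighted backlog pressure
    objective = V * energy + float(np.dot(queues, arrivals - services))
    new_queues = np.maximum(queues + arrivals - services, 0.0)
    return objective, new_queues

# toy usage: three coupled inference queues
q = np.zeros(3)
rng = np.random.default_rng(2)
for t in range(5):
    a = rng.poisson(4.0, size=3).astype(float)   # task arrivals
    s = rng.uniform(2.0, 6.0, size=3)            # offered service (action-dependent)
    e = rng.uniform(0.5, 2.0)                    # energy spent this slot
    obj, q = drift_plus_penalty(q, a, s, e)
    print(f"slot {t}: objective={obj:.2f}, queues={np.round(q, 2)}")
```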
arXiv:2412.17839 (https://arxiv.org/abs/2412.17839) [pdf, other]
Subjects: cs.LG (Machine Learning); cs.AI (Artificial Intelligence); eess.IV (Image and Video Processing)
LaMI-GO: Latent Mixture Integration for Goal-Oriented Communications Achieving High Spectrum Efficiency
Authors: Achintha Wijesinghe, Suchinthaka Wanninayaka, Weiwei Wang, Yu-Chieh Chao, Songyang Zhang, Zhi Ding
Abstract: The recent rise of semantic-style communications includes the development of goal-oriented communications (GO-COMs) for remarkably efficient multimedia information transmission. The concept of GO-COMs leverages advanced artificial intelligence (AI) tools to address the rising demand for bandwidth efficiency in applications such as edge computing and the Internet-of-Things (IoT). Unlike traditional communication systems focusing on source data accuracy, GO-COMs provide intelligent message delivery catering to the special needs critical to accomplishing downstream tasks at the receiver. In this work, we present a novel GO-COM framework, namely LaMI-GO, that utilizes emerging generative AI for better quality-of-service (QoS) with ultra-high communication efficiency. Specifically, we design our LaMI-GO system backbone based on a latent diffusion model followed by a vector-quantized generative adversarial network (VQGAN) for efficient latent embedding and information representation. The system trains a common feature codebook on the receiver side. Our experimental results demonstrate substantial improvement in perceptual quality, accuracy of downstream tasks, and bandwidth consumption over state-of-the-art GO-COM systems and establish the power of our proposed LaMI-GO communication framework.
Submitted 18 December, 2024; originally announced December 2024.
Comments: Under review
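The bandwidth saving in codebook-based designs like this one comes from transmitting discrete codeword indices instead of continuous latents. The following sketch shows generic vector quantization against a shared codebook; the codebook size, latent dimension, and bit accounting are hypothetical and are not taken from LaMI-GO.

```python
import numpy as np

rng = np.random.default_rng(3)

codebook = rng.normal(size=(512, 16))        # shared codebook: 512 codewords, dim 16
latents = rng.normal(size=(64, 16))          # encoder output: 64 latent vectors

# nearest-codeword search: transmit only the integer indices
d2 = ((latents[:, None, :] - codebook[None, :, :]) ** 2).sum(-1)
indices = d2.argmin(axis=1)                  # what actually goes over the channel
reconstructed = codebook[indices]            # receiver-side lookup

bits_latents = latents.size * 32             # raw float32 latents
bits_indices = indices.size * int(np.ceil(np.log2(len(codebook))))
print(f"payload: {bits_indices} bits for indices vs {bits_latents} bits for raw latents")
```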
arXiv:2412.14846 (https://arxiv.org/abs/2412.14846) [pdf, other]
Subjects: eess.IV (Image and Video Processing); cs.AI (Artificial Intelligence); cs.CV (Computer Vision and Pattern Recognition)
Head and Neck Tumor Segmentation of MRI from Pre- and Mid-radiotherapy with Pre-training, Data Augmentation and Dual Flow UNet
Authors: Litingyu Wang, Wenjun Liao, Shichuan Zhang, Guotai Wang
Abstract: Head and neck tumors and metastatic lymph nodes are crucial for treatment planning and prognostic analysis. Accurate segmentation and quantitative analysis of these structures require pixel-level annotation, making automated segmentation techniques essential for the diagnosis and treatment of head and neck cancer. In this study, we investigated the effects of multiple strategies on the segmentation of pre-radiotherapy (pre-RT) and mid-radiotherapy (mid-RT) images. For the segmentation of pre-RT images, we utilized: 1) a fully supervised learning approach, and 2) the same approach enhanced with pre-trained weights and the MixUp data augmentation technique. For mid-RT images, we introduced a novel computationally friendly network architecture that features separate encoders for mid-RT images and registered pre-RT images with their labels. The mid-RT encoder branch integrates information from pre-RT images and labels progressively during forward propagation. We selected the highest-performing model from each fold and used their predictions to create an ensemble average for inference. In the final test, our models achieved a segmentation performance of 82.38% for pre-RT and 72.53% for mid-RT on the aggregated Dice Similarity Coefficient (DSC) as team HiLab. Our code is available at https://github.com/WltyBY/HNTS-MRG2024_train_code.
Submitted 19 December, 2024; originally announced December 2024.
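MixUp, mentioned in the pre-RT training recipe above, is a standard augmentation that blends two samples and their labels with a Beta-distributed weight. A minimal sketch, with toy arrays standing in for image patches and one-hot labels:

```python
import numpy as np

def mixup(x1, y1, x2, y2, alpha=0.2, rng=None):
    """Blend two samples and their (one-hot or soft) labels with a Beta-drawn weight."""
    rng = rng or np.random.default_rng()
    lam = rng.beta(alpha, alpha)
    return lam * x1 + (1 - lam) * x2, lam * y1 + (1 - lam) * y2

rng = np.random.default_rng(4)
img_a, img_b = rng.random((2, 32, 32))           # two toy image patches
lab_a, lab_b = np.array([1.0, 0.0]), np.array([0.0, 1.0])
x_mix, y_mix = mixup(img_a, img_b, lab_a, lab_b, rng=rng)
print(x_mix.shape, y_mix)
```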
arXiv:2412.11882 (https://arxiv.org/abs/2412.11882) [pdf, other]
Subjects: cs.RO (Robotics); eess.SY (Systems and Control)
Hardware-in-the-loop Simulation Testbed for Geomagnetic Navigation
Authors: Songnan Yang, Shiliang Zhang, Qianyun Zhang, Xiaohui Zhang, Xuehui Ma
Abstract: Geomagnetic navigation leverages the Earth's ubiquitous magnetic signals to navigate missions, without dependence on GPS services or pre-stored geographic maps. It has drawn increasing attention and is particularly promising for long-range navigation into unexplored areas. Current geomagnetic navigation studies are still in the early stages with simulations and computational validations, without concrete efforts to develop cost-friendly test platforms that can empower deployment and experimental analysis of the developed approaches. This paper presents a hardware-in-the-loop simulation testbed to support geomagnetic navigation experimentation. Our testbed is dedicated to synthesizing the geomagnetic field environment for navigation. We develop the software in the testbed to simulate the dynamics of the navigation environment, and we build the hardware to generate the physical magnetic field, which follows and aligns with the simulated environment. The testbed aims to provide a controllable magnetic field that can be used to experiment with geomagnetic navigation in labs, thus avoiding real and expensive navigation experiments, e.g., in the ocean, for validating navigation prototypes. We build the testbed with off-the-shelf hardware in an unshielded environment to reduce cost. We also develop the field generation control and hardware parameter optimization for quality magnetic field generation.
We conduct a detailed performance analysis to show the quality of the field generation by the testbed, and we report the experimental results on performance indicators, including accuracy, uniformity, stability, and convergence of the generated field towards the target geomagnetic environment.
Submitted 16 December, 2024; originally announced December 2024.
arXiv:2412.10117 (https://arxiv.org/abs/2412.10117) [pdf, other]
Subjects: cs.SD (Sound); cs.AI (Artificial Intelligence); cs.LG (Machine Learning); eess.AS (Audio and Speech Processing)
CosyVoice 2: Scalable Streaming Speech Synthesis with Large Language Models
Authors: Zhihao Du, Yuxuan Wang, Qian Chen, Xian Shi, Xiang Lv, Tianyu Zhao, Zhifu Gao, Yexin Yang, Changfeng Gao, Hui Wang, Fan Yu, Huadai Liu, Zhengyan Sheng, Yue Gu, Chong Deng, Wen Wang, Shiliang Zhang, Zhijie Yan, Jingren Zhou
Abstract: In our previous work, we introduced CosyVoice, a multilingual speech synthesis model based on supervised discrete speech tokens. By employing progressive semantic decoding with two popular generative models, language models (LMs) and Flow Matching, CosyVoice demonstrated high prosody naturalness, content consistency, and speaker similarity in speech in-context learning. Recently, significant progress has been made in multi-modal large language models (LLMs), where the response latency and real-time factor of speech synthesis play a crucial role in the interactive experience. Therefore, in this report, we present an improved streaming speech synthesis model, CosyVoice 2, which incorporates comprehensive and systematic optimizations. Specifically, we introduce finite-scalar quantization to improve the codebook utilization of speech tokens. For the text-speech LM, we streamline the model architecture to allow direct use of a pre-trained LLM as the backbone. In addition, we develop a chunk-aware causal flow matching model to support various synthesis scenarios, enabling both streaming and non-streaming synthesis within a single model. By training on a large-scale multilingual dataset, CosyVoice 2 achieves human-parity naturalness, minimal response latency, and virtually lossless synthesis quality in the streaming mode. We invite readers to listen to the demos at https://funaudiollm.github.io/cosyvoice2.
Submitted 25 December, 2024; v1 submitted 13 December, 2024; originally announced December 2024.
Comments: Tech report, work in progress
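Finite-scalar quantization, mentioned above as the replacement for a conventional codebook, bounds each latent dimension and rounds it to a small number of levels, so the "codebook" is implicit. A simplified, gradient-free sketch follows; the particular level choices are an assumption, not CosyVoice 2's configuration.

```python
import numpy as np

def fsq_quantize(z, levels=(7, 7, 7, 5, 5, 5)):
    """Bound each latent dimension with tanh, round to `levels[i]` integer steps,
    and pack the result into a single implicit codebook index per vector."""
    levels = np.asarray(levels)
    half = (levels - 1) / 2.0
    q = np.round(np.tanh(z) * half)                 # integers in [-half, half]
    digits = (q + half).astype(int)                 # shift to [0, levels - 1]
    place = np.cumprod(np.concatenate(([1], levels[:-1])))
    index = (digits * place).sum(axis=-1)           # mixed-radix token id
    return q / half, index                          # dequantized value and token id

rng = np.random.default_rng(5)
z = rng.normal(size=(4, 6))                         # four 6-dimensional latent vectors
z_q, ids = fsq_quantize(z)
print(ids, "implicit codebook size =", int(np.prod((7, 7, 7, 5, 5, 5))))
```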
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Tech report, work in progress</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.09998">arXiv:2412.09998</a> <span> [<a href="https://arxiv.org/pdf/2412.09998">pdf</a>, <a href="https://arxiv.org/format/2412.09998">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Cycle-Consistent Bridge Diffusion Model for Accelerated MRI Reconstruction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Song%2C+T">Tao Song</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+Y">Yicheng Wu</a>, <a href="/search/eess?searchtype=author&query=Hu%2C+M">Minhao Hu</a>, <a href="/search/eess?searchtype=author&query=Luo%2C+X">Xiangde Luo</a>, <a href="/search/eess?searchtype=author&query=Luo%2C+G">Guoting Luo</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+G">Guotai Wang</a>, <a href="/search/eess?searchtype=author&query=Guo%2C+Y">Yi Guo</a>, <a href="/search/eess?searchtype=author&query=Xu%2C+F">Feng Xu</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+S">Shaoting Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.09998v1-abstract-short" style="display: inline;"> Accelerated MRI reconstruction techniques aim to reduce examination time while maintaining high image fidelity, which is highly desirable in clinical settings for improving patient comfort and hospital efficiency. Existing deep learning methods typically reconstruct images from under-sampled data with traditional reconstruction approaches, but they still struggle to provide high-fidelity results.… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.09998v1-abstract-full').style.display = 'inline'; document.getElementById('2412.09998v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.09998v1-abstract-full" style="display: none;"> Accelerated MRI reconstruction techniques aim to reduce examination time while maintaining high image fidelity, which is highly desirable in clinical settings for improving patient comfort and hospital efficiency. Existing deep learning methods typically reconstruct images from under-sampled data with traditional reconstruction approaches, but they still struggle to provide high-fidelity results. Diffusion models show great potential to improve fidelity of generated images in recent years. However, their inference process starting with a random Gaussian noise introduces instability into the results and usually requires thousands of sampling steps, resulting in sub-optimal reconstruction quality and low efficiency. To address these challenges, we propose Cycle-Consistent Bridge Diffusion Model (CBDM). 
CBDM employs two bridge diffusion models to construct a cycle-consistent diffusion process with a consistency loss, enhancing the fine-grained details of reconstructed images and reducing the number of diffusion steps. Moreover, CBDM incorporates a Contourlet Decomposition Embedding Module (CDEM) which captures multi-scale structural texture knowledge in images through frequency-domain decomposition pyramids and directional filter banks to improve structural fidelity. Extensive experiments demonstrate the superiority of our model, with higher reconstruction quality and fewer training iterations, achieving a new state of the art for accelerated MRI reconstruction on both the fastMRI and IXI datasets.
Submitted 13 December, 2024; originally announced December 2024.
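The cycle-consistency idea in CBDM can be illustrated independently of diffusion: map domain A to domain B and back, then penalize the round-trip error. The stand-in mappings below are exact inverses (so the loss is essentially zero); in CBDM they would be two learned bridge diffusion processes between under-sampled and fully-sampled images.

```python
import numpy as np

# stand-in "bridge" mappings between an under-sampled domain (A) and a
# fully-sampled domain (B); real bridges would be neural diffusion models
def a_to_b(x):
    return 2.0 * x + 1.0

def b_to_a(y):
    return (y - 1.0) / 2.0

def cycle_consistency_loss(x_a, y_b):
    """Mean squared error after mapping A->B->A and B->A->B."""
    loss_a = np.mean((b_to_a(a_to_b(x_a)) - x_a) ** 2)
    loss_b = np.mean((a_to_b(b_to_a(y_b)) - y_b) ** 2)
    return loss_a + loss_b

rng = np.random.default_rng(6)
print(cycle_consistency_loss(rng.normal(size=(8, 8)), rng.normal(size=(8, 8))))
```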
arXiv:2412.06980 (https://arxiv.org/abs/2412.06980) [pdf, other]
Subjects: eess.IV (Image and Video Processing)
Diff-GO$^\text{n}$: Enhancing Diffusion Models for Goal-Oriented Communications
Authors: Suchinthaka Wanninayaka, Achintha Wijesinghe, Weiwei Wang, Yu-Chieh Chao, Songyang Zhang, Zhi Ding
Abstract: The rapid expansion of edge devices and the Internet-of-Things (IoT) continues to heighten the demand for data transport under limited spectrum resources. Goal-oriented communication (GO-COM), unlike traditional communication systems designed for bit-level accuracy, prioritizes more critical information for specific application goals at the receiver. To improve the efficiency of generative learning models for GO-COM, this work introduces a novel noise-restricted diffusion-based GO-COM (Diff-GO$^\text{n}$) framework for reducing bandwidth overhead while preserving the media quality at the receiver. Specifically, we propose an innovative Noise-Restricted Forward Diffusion (NR-FD) framework to accelerate model training and reduce the computation burden for diffusion-based GO-COMs by leveraging a pre-sampled pseudo-random noise bank (NB). Moreover, we design an early stopping criterion for improving computational efficiency and convergence speed, allowing high-quality generation in fewer training steps. Our experimental results demonstrate superior perceptual quality of data transmission at reduced bandwidth usage and lower computation, making Diff-GO$^\text{n}$ well-suited for real-time communications and downstream applications.
Submitted 10 December, 2024; v1 submitted 9 December, 2024; originally announced December 2024.
Comments: Submitted to IEEE International Conference on Communications (ICC) 2025
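The pre-sampled noise bank idea can be sketched with a plain DDPM-style forward step: instead of drawing fresh Gaussian noise, both ends index into a shared bank, so only the small index needs to be communicated. Everything below (bank size, schedule, shapes) is an illustrative assumption rather than the paper's configuration.

```python
import numpy as np

rng = np.random.default_rng(7)

# pre-sampled pseudo-random noise bank shared by transmitter and receiver
noise_bank = rng.standard_normal(size=(1024, 32 * 32))     # 1024 reusable noise vectors

def forward_diffuse_with_bank(x0, t, alphas_cumprod, bank_index):
    """DDPM-style forward noising, but drawing noise from a fixed bank entry
    instead of sampling fresh Gaussian noise."""
    eps = noise_bank[bank_index]
    a_bar = alphas_cumprod[t]
    return np.sqrt(a_bar) * x0 + np.sqrt(1.0 - a_bar) * eps, bank_index

# toy schedule and "image"
T = 100
betas = np.linspace(1e-4, 0.02, T)
alphas_cumprod = np.cumprod(1.0 - betas)
x0 = rng.standard_normal(32 * 32)

x_t, idx = forward_diffuse_with_bank(x0, t=50, alphas_cumprod=alphas_cumprod,
                                     bank_index=17)
print("noised sample uses bank entry", idx, "- only this small index needs sharing")
```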
arXiv:2412.06965 (https://arxiv.org/abs/2412.06965) [pdf, other]
Subjects: cs.SD (Sound); eess.AS (Audio and Speech Processing)
Improving Source Extraction with Diffusion and Consistency Models
Authors: Tornike Karchkhadze, Mohammad Rasool Izadi, Shuo Zhang
Abstract: In this work, we demonstrate the integration of a score-matching diffusion model into a deterministic architecture for time-domain musical source extraction, resulting in enhanced audio quality. To address the typically slow iterative sampling process of diffusion models, we apply consistency distillation and reduce the sampling process to a single step, achieving performance comparable to that of diffusion models, and with two or more steps, even surpassing them. Trained on the Slakh2100 dataset for four instruments (bass, drums, guitar, and piano), our model shows significant improvements across objective metrics compared to baseline methods. Sound examples are available at https://consistency-separation.github.io/.
Submitted 9 December, 2024; originally announced December 2024.
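Consistency distillation replaces the long reverse-diffusion loop with a model that maps a noisy input directly to a clean estimate, optionally refined with one or two extra noise-and-denoise rounds. The skeleton below uses a trivial placeholder in place of the trained network, purely to show the control flow.

```python
import numpy as np

rng = np.random.default_rng(8)

def consistency_model(x_t, sigma):
    """Placeholder for a trained consistency model f(x_t, sigma) -> x_0 estimate."""
    return x_t / (1.0 + sigma)            # stand-in; the real model is a neural network

def one_step_extraction(mixture, sigma_max=80.0):
    """Single-step generation: start from the noisiest state and map directly to x_0."""
    x_t = mixture + sigma_max * rng.standard_normal(mixture.shape)
    return consistency_model(x_t, sigma_max)

def two_step_extraction(mixture, sigmas=(80.0, 10.0)):
    """Optional extra step: re-noise the estimate to a smaller sigma and denoise again."""
    x0 = one_step_extraction(mixture, sigmas[0])
    x_t = x0 + sigmas[1] * rng.standard_normal(x0.shape)
    return consistency_model(x_t, sigmas[1])

mixture = rng.standard_normal(16_000)     # toy 1-second waveform stand-in
print(one_step_extraction(mixture).shape, two_step_extraction(mixture).shape)
```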
arXiv:2412.03959 (https://arxiv.org/abs/2412.03959) [pdf, other]
Subjects: cs.RO (Robotics); eess.SY (Systems and Control)
Is FISHER All You Need in The Multi-AUV Underwater Target Tracking Task?
Authors: Jingzehua Xu, Guanwen Xie, Ziqi Zhang, Xiangwang Hou, Dongfang Ma, Shuai Zhang, Yong Ren, Dusit Niyato
Abstract: It is important to employ multiple autonomous underwater vehicles (AUVs) to execute the underwater target tracking task collaboratively. However, it is challenging to meet the various prerequisites using traditional control methods. Therefore, we propose an effective two-stage learning-from-demonstrations training framework, FISHER, to highlight the adaptability of reinforcement learning (RL) methods in the multi-AUV underwater target tracking task, while addressing its limitations such as extensive requirements for environmental interactions and the challenges in designing reward functions. The first stage utilizes imitation learning (IL) to realize policy improvement and generate offline datasets. To be specific, we introduce multi-agent discriminator-actor-critic based on improvements of the generative adversarial IL algorithm and a multi-agent IL optimization objective derived from the Nash equilibrium condition. Then in the second stage, we develop a multi-agent independent generalized decision transformer, which analyzes the latent representation to match the future states of high-quality samples rather than a reward function, attaining further enhanced policies capable of handling various scenarios. Besides, we propose a simulation-to-simulation demonstration generation procedure to facilitate the generation of expert demonstrations in underwater environments, which capitalizes on traditional control methods and can easily accomplish the domain transfer to obtain demonstrations. Extensive simulation experiments from multiple scenarios showcase that FISHER possesses strong stability, multi-task performance and generalization capability.
Submitted 5 December, 2024; originally announced December 2024.
Journal ref: IEEE Transactions on Mobile Computing 2025
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> IEEE Transactions on Mobile Computing 2025 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.03388">arXiv:2412.03388</a> <span> [<a href="https://arxiv.org/pdf/2412.03388">pdf</a>, <a href="https://arxiv.org/format/2412.03388">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> DiffStyleTTS: Diffusion-based Hierarchical Prosody Modeling for Text-to-Speech with Diverse and Controllable Styles </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Liu%2C+J">Jiaxuan Liu</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+Z">Zhaoci Liu</a>, <a href="/search/eess?searchtype=author&query=Hu%2C+Y">Yajun Hu</a>, <a href="/search/eess?searchtype=author&query=Gao%2C+Y">Yingying Gao</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+S">Shilei Zhang</a>, <a href="/search/eess?searchtype=author&query=Ling%2C+Z">Zhenhua Ling</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.03388v1-abstract-short" style="display: inline;"> Human speech exhibits rich and flexible prosodic variations. To address the one-to-many mapping problem from text to prosody in a reasonable and flexible manner, we propose DiffStyleTTS, a multi-speaker acoustic model based on a conditional diffusion module and an improved classifier-free guidance, which hierarchically models speech prosodic features, and controls different prosodic styles to guid… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.03388v1-abstract-full').style.display = 'inline'; document.getElementById('2412.03388v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.03388v1-abstract-full" style="display: none;"> Human speech exhibits rich and flexible prosodic variations. To address the one-to-many mapping problem from text to prosody in a reasonable and flexible manner, we propose DiffStyleTTS, a multi-speaker acoustic model based on a conditional diffusion module and an improved classifier-free guidance, which hierarchically models speech prosodic features, and controls different prosodic styles to guide prosody prediction. Experiments show that our method outperforms all baselines in naturalness and achieves superior synthesis speed compared to three diffusion-based baselines. Additionally, by adjusting the guiding scale, DiffStyleTTS effectively controls the guidance intensity of the synthetic prosody. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.03388v1-abstract-full').style.display = 'none'; document.getElementById('2412.03388v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">COLING 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.00820">arXiv:2412.00820</a> <span> [<a href="https://arxiv.org/pdf/2412.00820">pdf</a>, <a href="https://arxiv.org/format/2412.00820">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Non-Terrestrial Networking for 6G: Evolution, Opportunities, and Future Directions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Wang%2C+F">Feng Wang</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+S">Shengyu Zhang</a>, <a href="/search/eess?searchtype=author&query=Yang%2C+H">Huiting Yang</a>, <a href="/search/eess?searchtype=author&query=Quek%2C+T+Q+S">Tony Q. S. Quek</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.00820v1-abstract-short" style="display: inline;"> From 5G onwards, Non-Terrestrial Networks (NTNs) have emerged as a key component of future network architectures. Leveraging Low Earth Orbit (LEO) satellite constellations, NTNs are capable of building a space Internet and present a paradigm shift in delivering mobile services to even the most remote regions on Earth. However, the extensive coverage and rapid movement of LEO satellites pose unique… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.00820v1-abstract-full').style.display = 'inline'; document.getElementById('2412.00820v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.00820v1-abstract-full" style="display: none;"> From 5G onwards, Non-Terrestrial Networks (NTNs) have emerged as a key component of future network architectures. Leveraging Low Earth Orbit (LEO) satellite constellations, NTNs are capable of building a space Internet and present a paradigm shift in delivering mobile services to even the most remote regions on Earth. However, the extensive coverage and rapid movement of LEO satellites pose unique challenges for NTN networking, including user equipment (UE) access and inter-satellite delivery, which directly impact the quality of service (QoS) and data transmission continuity. 
arXiv:2411.14684 (https://arxiv.org/abs/2411.14684) [pdf, other]
Subjects: eess.IV (Image and Video Processing); cs.AI (Artificial Intelligence); cs.CV (Computer Vision and Pattern Recognition)
Cross Group Attention and Group-wise Rolling for Multimodal Medical Image Synthesis
Authors: Tao Song, Yicheng Wu, Minhao Hu, Xiangde Luo, Linda Wei, Guotai Wang, Yi Guo, Feng Xu, Shaoting Zhang
Abstract: Multimodal MR image synthesis aims to generate a missing modality image by fusing and mapping a few available MRI modalities. Most existing approaches typically adopt an image-to-image translation scheme. However, these methods often suffer from sub-optimal performance due to the spatial misalignment between different modalities when they are treated as input channels.
Therefore, in this paper, we propose an Adaptive Group-wise Interaction Network (AGI-Net) that explores both inter-modality and intra-modality relationships for multimodal MR image synthesis. Specifically, groups are first pre-defined along the channel dimension and then we perform an adaptive rolling for the standard convolutional kernel to capture inter-modality spatial correspondences. At the same time, a cross-group attention module is introduced to fuse information across different channel groups, leading to better feature representation. We evaluated the effectiveness of our model on the publicly available IXI and BraTS2023 datasets, where the AGI-Net achieved state-of-the-art performance for multimodal MR image synthesis. Code will be released.
Submitted 21 November, 2024; originally announced November 2024.
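The group-wise rolling operation can be pictured as splitting the channel dimension into groups and giving each group its own spatial shift. In AGI-Net the offsets are learned adaptively; the sketch below uses fixed, hand-picked shifts just to show the mechanics.

```python
import numpy as np

def group_wise_roll(feat, n_groups, shifts):
    """Split channels into groups and spatially roll each group by its own offset.
    feat: (C, H, W); shifts: one (dy, dx) pair per group."""
    groups = np.array_split(feat, n_groups, axis=0)
    rolled = [np.roll(g, shift=s, axis=(1, 2)) for g, s in zip(groups, shifts)]
    return np.concatenate(rolled, axis=0)

rng = np.random.default_rng(10)
feat = rng.normal(size=(8, 16, 16))                   # toy features from 4 modalities
out = group_wise_roll(feat, n_groups=4, shifts=[(0, 0), (1, 0), (0, -1), (2, 2)])
print(out.shape)
```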
arXiv:2411.12273 (https://arxiv.org/abs/2411.12273) [pdf, other]
Subjects: eess.IV (Image and Video Processing); cs.CV (Computer Vision and Pattern Recognition)
Acquire Precise and Comparable Fundus Image Quality Score: FTHNet and FQS Dataset
Authors: Zheng Gong, Zhuo Deng, Run Gan, Zhiyuan Niu, Lu Chen, Canfeng Huang, Jia Liang, Weihao Gao, Fang Li, Shaochong Zhang, Lan Ma
Abstract: Retinal fundus images are used extensively in diagnosis, and their quality can directly affect the diagnostic results. However, due to insufficient datasets and limited algorithm application, current fundus image quality assessment (FIQA) methods are not powerful enough to meet ophthalmologists' demands. In this paper, we address the limitations of datasets and algorithms in FIQA. First, we establish a new FIQA dataset, Fundus Quality Score (FQS), which includes 2246 fundus images with two labels: a continuous Mean Opinion Score varying from 0 to 100 and a three-level quality label. Then, we propose an FIQA Transformer-based Hypernetwork (FTHNet) to solve these tasks with regression results rather than the classification results used in conventional FIQA works. The FTHNet is optimized for the FIQA tasks with extensive experiments. Results on our FQS dataset show that the FTHNet can give quality scores for fundus images with a PLCC of 0.9423 and an SRCC of 0.9488, significantly outperforming other methods with fewer parameters and lower computational complexity. We successfully build a dataset and model addressing the problems of current FIQA methods. Furthermore, the model deployment experiments demonstrate its potential in automatic medical image quality control. All experiments are carried out with 10-fold cross-validation to ensure the significance of the results.
Submitted 19 November, 2024; originally announced November 2024.
Comments: 11 pages, 7 figures
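PLCC and SRCC, the two metrics reported above, are the Pearson and Spearman correlations between predicted and ground-truth quality scores. They can be computed directly with scipy; the scores below are synthetic stand-ins.

```python
import numpy as np
from scipy.stats import pearsonr, spearmanr

rng = np.random.default_rng(11)
mos = rng.uniform(0, 100, size=200)                   # ground-truth Mean Opinion Scores
pred = mos + rng.normal(0, 8, size=200)               # hypothetical model predictions

plcc, _ = pearsonr(pred, mos)     # linear correlation (prediction accuracy)
srcc, _ = spearmanr(pred, mos)    # rank correlation (prediction monotonicity)
print(f"PLCC={plcc:.4f}, SRCC={srcc:.4f}")
```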
arXiv:2411.11030 (https://arxiv.org/abs/2411.11030) [pdf, other]
Subjects: eess.SP (Signal Processing)
IREE Oriented Active RIS-Assisted Green communication System with Outdated CSI
Authors: Kai Cao, Tao Yu, Jihong Li, Xiaojing Chen, Yanzan Sun, Qingqing Wu, Wen Chen, Shunqing Zhang
Abstract: The rapid evolution of communication technologies has spurred a growing demand for energy-efficient network architectures and performance metrics.
arXiv:2411.11030 (https://arxiv.org/abs/2411.11030) [pdf, other]
Subjects: Signal Processing (eess.SP)
IREE Oriented Active RIS-Assisted Green communication System with Outdated CSI
Authors: Kai Cao, Tao Yu, Jihong Li, Xiaojing Chen, Yanzan Sun, Qingqing Wu, Wen Chen, Shunqing Zhang
Abstract: The rapid evolution of communication technologies has spurred a growing demand for energy-efficient network architectures and performance metrics. Active Reconfigurable Intelligent Surfaces (RIS) are emerging as a key component in green network architectures. Compared to passive RIS, active RIS are equipped with amplifiers on each reflecting element, allowing them to simultaneously reflect and amplify signals, thereby overcoming the double multiplicative fading in the phase response and improving both system coverage and performance. Additionally, the Integrated Relative Energy Efficiency (IREE) metric, as introduced in [1], addresses the dynamic variations in traffic and capacity over time and space, enabling more energy-efficient wireless systems. Building on these advancements, this paper investigates the problem of maximizing IREE in active RIS-assisted green communication systems. However, acquiring perfect Channel State Information (CSI) in practical systems poses significant challenges and costs. To address this, we derive the average achievable rate based on outdated CSI and formulate the corresponding IREE maximization problem, which is solved by jointly optimizing beamforming at both the base station and the RIS. Given the non-convex nature of the problem, we propose an Alternating Optimization Successive Approximation (AOSO) algorithm. By applying quadratic transform and relaxation techniques, we simplify the original problem and alternately optimize the beamforming matrices at the base station and the RIS. Furthermore, to handle the discrete constraints of the RIS reflection coefficients, we develop a successive approximation method. Experimental results validate our theoretical analysis of the algorithm's convergence, demonstrating the effectiveness of the proposed algorithm and highlighting the superiority of IREE in enhancing the performance of green communication networks.
Submitted 17 November, 2024; originally announced November 2024.

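The AOSO algorithm itself involves quadratic transforms and successive approximation; the toy sketch below only illustrates the underlying alternating-optimization pattern (fix one block of variables, optimize the other in closed form), applied here to a simple rank-1 fitting problem rather than to BS/RIS beamforming:

```python
# Toy illustration of alternating optimization (not the AOSO algorithm):
# fit a rank-1 model A ~ x y^T by alternating closed-form block updates,
# the same "fix one block, optimize the other" pattern used when the BS and
# RIS beamforming matrices are optimized in turn.
import numpy as np

rng = np.random.default_rng(0)
A = rng.standard_normal((8, 6))
x = rng.standard_normal(8)
y = rng.standard_normal(6)

for _ in range(50):
    x = A @ y / (y @ y)        # optimal x with y held fixed
    y = A.T @ x / (x @ x)      # optimal y with x held fixed
    loss = np.linalg.norm(A - np.outer(x, y))
print(f"final residual: {loss:.4f}")
```
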
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.06540">arXiv:2411.06540</a> <span> [<a href="https://arxiv.org/pdf/2411.06540">pdf</a>, <a href="https://arxiv.org/format/2411.06540">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Debatts: Zero-Shot Debating Text-to-Speech Synthesis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Huang%2C+Y">Yiqiao Huang</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+Y">Yuancheng Wang</a>, <a href="/search/eess?searchtype=author&query=Li%2C+J">Jiaqi Li</a>, <a href="/search/eess?searchtype=author&query=Guo%2C+H">Haotian Guo</a>, <a href="/search/eess?searchtype=author&query=He%2C+H">Haorui He</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+S">Shunsi Zhang</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+Z">Zhizheng Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.06540v2-abstract-short" style="display: inline;"> In debating, rebuttal is one of the most critical stages, where a speaker addresses the arguments presented by the opposing side. During this process, the speaker synthesizes their own persuasive articulation given the context from the opposing side. This work proposes a novel zero-shot text-to-speech synthesis system for rebuttal, namely Debatts. Debatts takes two speech prompts, one from the opp… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06540v2-abstract-full').style.display = 'inline'; document.getElementById('2411.06540v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.06540v2-abstract-full" style="display: none;"> In debating, rebuttal is one of the most critical stages, where a speaker addresses the arguments presented by the opposing side. During this process, the speaker synthesizes their own persuasive articulation given the context from the opposing side. This work proposes a novel zero-shot text-to-speech synthesis system for rebuttal, namely Debatts. Debatts takes two speech prompts, one from the opposing side (i.e. opponent) and one from the speaker. The prompt from the opponent is supposed to provide debating style prosody, and the prompt from the speaker provides identity information. In particular, we pretrain the Debatts system from in-the-wild dataset, and integrate an additional reference encoder to take debating prompt for style. In addition, we also create a debating dataset to develop Debatts. In this setting, Debatts can generate a debating-style speech in rebuttal for any voices. Experimental results confirm the effectiveness of the proposed system in comparison with the classic zero-shot TTS systems. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06540v2-abstract-full').style.display = 'none'; document.getElementById('2411.06540v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 10 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.06437">arXiv:2411.06437</a> <span> [<a href="https://arxiv.org/pdf/2411.06437">pdf</a>, <a href="https://arxiv.org/format/2411.06437">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> CTC-Assisted LLM-Based Contextual ASR </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Yang%2C+G">Guanrou Yang</a>, <a href="/search/eess?searchtype=author&query=Ma%2C+Z">Ziyang Ma</a>, <a href="/search/eess?searchtype=author&query=Gao%2C+Z">Zhifu Gao</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+S">Shiliang Zhang</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+X">Xie Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.06437v1-abstract-short" style="display: inline;"> Contextual ASR or hotword customization holds substantial practical value. Despite the impressive performance of current end-to-end (E2E) automatic speech recognition (ASR) systems, they often face challenges in accurately recognizing rare words. Typical E2E contextual ASR models commonly feature complex architectures and decoding mechanisms, limited in performance and susceptible to interference… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06437v1-abstract-full').style.display = 'inline'; document.getElementById('2411.06437v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.06437v1-abstract-full" style="display: none;"> Contextual ASR or hotword customization holds substantial practical value. Despite the impressive performance of current end-to-end (E2E) automatic speech recognition (ASR) systems, they often face challenges in accurately recognizing rare words. Typical E2E contextual ASR models commonly feature complex architectures and decoding mechanisms, limited in performance and susceptible to interference from distractor words. With large language model (LLM)-based ASR models emerging as the new mainstream, we propose a CTC-Assisted LLM-Based Contextual ASR model with an efficient filtering algorithm. 
arXiv:2411.06437 (https://arxiv.org/abs/2411.06437) [pdf, other]
Subjects: Audio and Speech Processing (eess.AS); Artificial Intelligence (cs.AI); Computation and Language (cs.CL)
CTC-Assisted LLM-Based Contextual ASR
Authors: Guanrou Yang, Ziyang Ma, Zhifu Gao, Shiliang Zhang, Xie Chen
Abstract: Contextual ASR or hotword customization holds substantial practical value. Despite the impressive performance of current end-to-end (E2E) automatic speech recognition (ASR) systems, they often face challenges in accurately recognizing rare words. Typical E2E contextual ASR models commonly feature complex architectures and decoding mechanisms, are limited in performance, and are susceptible to interference from distractor words. With large language model (LLM)-based ASR models emerging as the new mainstream, we propose a CTC-Assisted LLM-Based Contextual ASR model with an efficient filtering algorithm. By using coarse CTC decoding results to filter potentially relevant hotwords and incorporating them into the LLM prompt input, our model attains WER/B-WER of 1.27%/3.67% and 2.72%/8.02% on the LibriSpeech test-clean and test-other sets when targeting rare long-tail words, demonstrating significant improvements compared to the baseline LLM-based ASR model and substantially surpassing other related work. More remarkably, with the help of the large language model and the proposed filtering algorithm, our contextual ASR model still performs well with 2000 biasing words.
Submitted 10 November, 2024; originally announced November 2024.
Comments: SLT 2024

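The general idea of pre-filtering a large biasing list against a coarse first-pass transcript before prompting an LLM can be sketched as follows; the matching rule and prompt template here are invented for illustration and are not the paper's algorithm:

```python
# Rough sketch of hotword pre-filtering before LLM prompting (illustrative only;
# the matching rule and prompt template are invented, not the paper's).
from difflib import SequenceMatcher

def filter_hotwords(ctc_transcript: str, hotwords: list[str], threshold: float = 0.7) -> list[str]:
    """Keep biasing words that approximately appear in the coarse CTC transcript."""
    tokens = ctc_transcript.lower().split()
    kept = []
    for word in hotwords:
        best = max(SequenceMatcher(None, word.lower(), t).ratio() for t in tokens)
        if best >= threshold:
            kept.append(word)
    return kept

ctc_hyp = "the kaiserin met the delegashun at noon"          # noisy first-pass hypothesis
biasing_list = ["Kaiserin", "delegation", "Petersburg", "armistice"]
shortlist = filter_hotwords(ctc_hyp, biasing_list)
prompt = f"Relevant words: {', '.join(shortlist)}. Transcribe the audio."
print(prompt)
```
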
arXiv:2411.04382 (https://arxiv.org/abs/2411.04382) [pdf, ps, other]
Subjects: Signal Processing (eess.SP)
Holographic-Pattern Based Multi-User Beam Training in RHS-Aided Hybrid Near-Field and Far-Field Communications
Authors: Shupei Zhang, Boya Di, Aryan Kaushik, Yonina C. Eldar
Abstract: Reconfigurable holographic surfaces (RHSs) have been suggested as an energy-efficient solution for extremely large-scale arrays. By controlling the amplitude of the RHS elements, high-gain directional holographic patterns can be achieved. However, the complexity of acquiring real-time channel state information (CSI) for beamforming is exceedingly high, particularly in large-scale RHS-assisted communications, where users may be distributed in the near-field region of the RHS. This paper proposes a one-shot multi-user beam training scheme for large-scale RHS-assisted systems that is applicable to both the near field and the far field. The proposed beam training scheme comprises two phases: angle search and distance search, both conducted simultaneously for all users. For the angle search, an RHS angular codebook is designed based on holographic principles so that each codeword covers multiple angles in both the near-field and far-field regions, enabling simultaneous angular search for all users. For the distance search, we construct distance-adaptive codewords covering all candidate user angles in real time by leveraging the additivity of holographic patterns, which differs from the traditional phased-array case. Simulation results demonstrate that the proposed scheme achieves higher system throughput compared to traditional beam training schemes. The beam training accuracy approaches the upper bound of exhaustive search at a significantly reduced overhead.
Submitted 6 November, 2024; originally announced November 2024.
Comments: 13 pages, 15 figures

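For context, codebook-based beam training in its simplest far-field form amounts to sweeping a set of codewords and keeping the one with the largest received power. The sketch below uses a plain DFT-style codebook, not the holographic RHS codebook constructed in the paper:

```python
# Generic codebook-based beam sweep (far-field, DFT-style codebook), only to
# illustrate the "test every codeword, keep the strongest" step of beam
# training; the paper's holographic RHS codebook is constructed differently.
import numpy as np

N = 32                                    # number of array elements
angles = np.deg2rad(np.linspace(-60, 60, 64))
codebook = np.exp(1j * np.pi * np.outer(np.arange(N), np.sin(angles))) / np.sqrt(N)

true_angle = np.deg2rad(23.0)             # hypothetical user direction
h = np.exp(1j * np.pi * np.arange(N) * np.sin(true_angle))   # line-of-sight channel

power = np.abs(h.conj() @ codebook) ** 2  # received power for each codeword
best = np.argmax(power)
print(f"selected beam angle: {np.rad2deg(angles[best]):.1f} deg")
```
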
arXiv:2411.00413 (https://arxiv.org/abs/2411.00413) [pdf, other]
Subjects: Robotics (cs.RO); Systems and Control (eess.SY)
Multi-Uncertainty Aware Autonomous Cooperative Planning
Authors: Shiyao Zhang, He Li, Shengyu Zhang, Shuai Wang, Derrick Wing Kwan Ng, Chengzhong Xu
Abstract: Autonomous cooperative planning (ACP) is a promising technique to improve the efficiency and safety of multi-vehicle interactions for future intelligent transportation systems. However, realizing robust ACP is a challenge due to the aggregation of perception, motion, and communication uncertainties. This paper proposes a novel multi-uncertainty aware ACP (MUACP) framework that simultaneously accounts for multiple types of uncertainties via regularized cooperative model predictive control (RC-MPC). The regularizers and constraints for perception, motion, and communication are constructed according to the confidence levels, weather conditions, and outage probabilities, respectively. The effectiveness of the proposed method is evaluated in the Car Learning to Act (CARLA) simulation platform. Results demonstrate that the proposed MUACP efficiently performs cooperative formation in real time and outperforms other benchmark approaches in various scenarios under imperfect knowledge of the environment.
Submitted 1 November, 2024; originally announced November 2024.

arXiv:2411.00335 (https://arxiv.org/abs/2411.00335) [pdf, other]
Subjects: Computer Vision and Pattern Recognition (cs.CV); Neural and Evolutionary Computing (cs.NE); Image and Video Processing (eess.IV)
NCST: Neural-based Color Style Transfer for Video Retouching
Authors: Xintao Jiang, Yaosen Chen, Siqin Zhang, Wei Wang, Xuming Wen
Abstract: Video color style transfer aims to transform the color style of an original video by using a reference style image. Most existing methods employ neural networks, which come with challenges like opaque transfer processes and limited user control over the outcomes. Typically, users cannot fine-tune the resulting images or videos. To tackle this issue, we introduce a method that predicts specific parameters for color style transfer using two images. Initially, we train a neural network to learn the corresponding color adjustment parameters. When applying style transfer to a video, we fine-tune the network with key frames from the video and the chosen style image, generating precise transformation parameters. These are then applied to convert the color style of both images and videos. Our experimental results demonstrate that our algorithm surpasses current methods in color style transfer quality. Moreover, each parameter in our method has a specific, interpretable meaning, enabling users to understand the color style transfer process and allowing them to perform manual fine-tuning if desired.
Submitted 31 October, 2024; originally announced November 2024.
Comments: 10 pages, 8 figures

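The appeal of predicting explicit, interpretable color-adjustment parameters is that users can still edit them afterwards. Below is a minimal sketch with a hypothetical parameter set (exposure, contrast, saturation); this is not the parameterization used by NCST:

```python
# Sketch of applying interpretable color-adjustment parameters to an image.
# The parameter set (exposure, contrast, saturation) is hypothetical and not
# the parameterization predicted by NCST; it only shows why explicit
# parameters remain user-editable after prediction.
import numpy as np

def apply_params(img: np.ndarray, exposure: float, contrast: float, saturation: float) -> np.ndarray:
    """img: float RGB array in [0, 1], shape (H, W, 3)."""
    out = img * exposure                            # global brightness gain
    out = (out - 0.5) * contrast + 0.5              # contrast around mid-grey
    grey = out.mean(axis=-1, keepdims=True)
    out = grey + (out - grey) * saturation          # blend toward/away from grey
    return np.clip(out, 0.0, 1.0)

frame = np.random.rand(4, 4, 3)                     # stand-in for a video frame
retouched = apply_params(frame, exposure=1.1, contrast=1.2, saturation=0.9)
```
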
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages, 8 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.21951">arXiv:2410.21951</a> <span> [<a href="https://arxiv.org/pdf/2410.21951">pdf</a>, <a href="https://arxiv.org/format/2410.21951">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Fast and High-Quality Auto-Regressive Speech Synthesis via Speculative Decoding </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Li%2C+B">Bohan Li</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+H">Hankun Wang</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+S">Situo Zhang</a>, <a href="/search/eess?searchtype=author&query=Guo%2C+Y">Yiwei Guo</a>, <a href="/search/eess?searchtype=author&query=Yu%2C+K">Kai Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.21951v2-abstract-short" style="display: inline;"> The auto-regressive architecture, like GPTs, is widely used in modern Text-to-Speech (TTS) systems. However, it incurs substantial inference time, particularly due to the challenges in the next-token prediction posed by lengthy sequences of speech tokens. In this work, we introduce VADUSA, one of the first approaches to accelerate auto-regressive TTS through speculative decoding. Our results show… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21951v2-abstract-full').style.display = 'inline'; document.getElementById('2410.21951v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.21951v2-abstract-full" style="display: none;"> The auto-regressive architecture, like GPTs, is widely used in modern Text-to-Speech (TTS) systems. However, it incurs substantial inference time, particularly due to the challenges in the next-token prediction posed by lengthy sequences of speech tokens. In this work, we introduce VADUSA, one of the first approaches to accelerate auto-regressive TTS through speculative decoding. Our results show that VADUSA not only significantly improves inference speed but also enhances performance by incorporating draft heads to predict future speech content auto-regressively. Furthermore, the inclusion of a tolerance mechanism during sampling accelerates inference without compromising quality. Our approach demonstrates strong generalization across large datasets and various types of speech tokens. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21951v2-abstract-full').style.display = 'none'; document.getElementById('2410.21951v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 29 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ICASSP 2025</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 68T07 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.21351">arXiv:2410.21351</a> <span> [<a href="https://arxiv.org/pdf/2410.21351">pdf</a>, <a href="https://arxiv.org/format/2410.21351">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> LinFormer: A Linear-based Lightweight Transformer Architecture For Time-Aware MIMO Channel Prediction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Jin%2C+Y">Yanliang Jin</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+Y">Yifan Wu</a>, <a href="/search/eess?searchtype=author&query=Gao%2C+Y">Yuan Gao</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+S">Shunqing Zhang</a>, <a href="/search/eess?searchtype=author&query=Xu%2C+S">Shugong Xu</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+C">Cheng-Xiang Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.21351v1-abstract-short" style="display: inline;"> The emergence of 6th generation (6G) mobile networks brings new challenges in supporting high-mobility communications, particularly in addressing the issue of channel aging. While existing channel prediction methods offer improved accuracy at the expense of increased computational complexity, limiting their practical application in mobile networks. To address these challenges, we present LinFormer… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21351v1-abstract-full').style.display = 'inline'; document.getElementById('2410.21351v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.21351v1-abstract-full" style="display: none;"> The emergence of 6th generation (6G) mobile networks brings new challenges in supporting high-mobility communications, particularly in addressing the issue of channel aging. 
While existing channel prediction methods offer improved accuracy at the expense of increased computational complexity, limiting their practical application in mobile networks. To address these challenges, we present LinFormer, an innovative channel prediction framework based on a scalable, all-linear, encoder-only Transformer model. Our approach, inspired by natural language processing (NLP) models such as BERT, adapts an encoder-only architecture specifically for channel prediction tasks. We propose replacing the computationally intensive attention mechanism commonly used in Transformers with a time-aware multi-layer perceptron (TMLP), significantly reducing computational demands. The inherent time awareness of TMLP module makes it particularly suitable for channel prediction tasks. We enhance LinFormer's training process by employing a weighted mean squared error loss (WMSELoss) function and data augmentation techniques, leveraging larger, readily available communication datasets. Our approach achieves a substantial reduction in computational complexity while maintaining high prediction accuracy, making it more suitable for deployment in cost-effective base stations (BS). Comprehensive experiments using both simulated and measured data demonstrate that LinFormer outperforms existing methods across various mobility scenarios, offering a promising solution for future wireless communication systems. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21351v1-abstract-full').style.display = 'none'; document.getElementById('2410.21351v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
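Replacing attention with an all-linear token mixer can be pictured as an MLP applied along the time axis, as in MLP-Mixer-style blocks. The layer below is a generic sketch of that idea; its sizes and structure are assumptions, not the TMLP defined in the paper:

```python
# Generic "mix along the time axis with an MLP instead of attention" block
# (in the spirit of MLP-Mixer token mixing); the layer sizes and structure are
# assumptions for illustration and are not the paper's TMLP.
import torch
import torch.nn as nn

class TimeMixingMLP(nn.Module):
    def __init__(self, seq_len: int, d_model: int, hidden: int = 64):
        super().__init__()
        self.norm = nn.LayerNorm(d_model)
        self.mix = nn.Sequential(           # operates on the time dimension
            nn.Linear(seq_len, hidden), nn.GELU(), nn.Linear(hidden, seq_len)
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x: (batch, seq_len, d_model); mix across time for every feature
        y = self.norm(x).transpose(1, 2)    # (batch, d_model, seq_len)
        y = self.mix(y).transpose(1, 2)     # back to (batch, seq_len, d_model)
        return x + y                        # residual connection

h = torch.randn(2, 12, 32)                  # e.g. 12 past CSI snapshots, 32 features
print(TimeMixingMLP(seq_len=12, d_model=32)(h).shape)
```
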
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.20336">arXiv:2410.20336</a> <span> [<a href="https://arxiv.org/pdf/2410.20336">pdf</a>, <a href="https://arxiv.org/format/2410.20336">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Get Large Language Models Ready to Speak: A Late-fusion Approach for Speech Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Shen%2C+M">Maohao Shen</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+S">Shun Zhang</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+J">Jilong Wu</a>, <a href="/search/eess?searchtype=author&query=Xiu%2C+Z">Zhiping Xiu</a>, <a href="/search/eess?searchtype=author&query=AlBadawy%2C+E">Ehab AlBadawy</a>, <a href="/search/eess?searchtype=author&query=Lu%2C+Y">Yiting Lu</a>, <a href="/search/eess?searchtype=author&query=Seltzer%2C+M">Mike Seltzer</a>, <a href="/search/eess?searchtype=author&query=He%2C+Q">Qing He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.20336v1-abstract-short" style="display: inline;"> Large language models (LLMs) have revolutionized natural language processing (NLP) with impressive performance across various text-based tasks. However, the extension of text-dominant LLMs to with speech generation tasks remains under-explored. In this work, we introduce a text-to-speech (TTS) system powered by a fine-tuned Llama model, named TTS-Llama, that achieves state-of-the-art speech synthe… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20336v1-abstract-full').style.display = 'inline'; document.getElementById('2410.20336v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.20336v1-abstract-full" style="display: none;"> Large language models (LLMs) have revolutionized natural language processing (NLP) with impressive performance across various text-based tasks. However, the extension of text-dominant LLMs to with speech generation tasks remains under-explored. In this work, we introduce a text-to-speech (TTS) system powered by a fine-tuned Llama model, named TTS-Llama, that achieves state-of-the-art speech synthesis performance. Building on TTS-Llama, we further propose MoLE-Llama, a text-and-speech multimodal LLM developed through purely late-fusion parameter-efficient fine-tuning (PEFT) and a mixture-of-expert architecture. Extensive empirical results demonstrate MoLE-Llama's competitive performance on both text-only question-answering (QA) and TTS tasks, mitigating catastrophic forgetting issue in either modality. Finally, we further explore MoLE-Llama in text-in-speech-out QA tasks, demonstrating its great potential as a multimodal dialog system capable of speech generation. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20336v1-abstract-full').style.display = 'none'; document.getElementById('2410.20336v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.20304">arXiv:2410.20304</a> <span> [<a href="https://arxiv.org/pdf/2410.20304">pdf</a>, <a href="https://arxiv.org/ps/2410.20304">ps</a>, <a href="https://arxiv.org/format/2410.20304">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Deep Learning, Machine Learning -- Digital Signal and Image Processing: From Theory to Application </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Hsieh%2C+W">Weiche Hsieh</a>, <a href="/search/eess?searchtype=author&query=Bi%2C+Z">Ziqian Bi</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+J">Junyu Liu</a>, <a href="/search/eess?searchtype=author&query=Peng%2C+B">Benji Peng</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+S">Sen Zhang</a>, <a href="/search/eess?searchtype=author&query=Pan%2C+X">Xuanhe Pan</a>, <a href="/search/eess?searchtype=author&query=Xu%2C+J">Jiawei Xu</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+J">Jinlang Wang</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+K">Keyu Chen</a>, <a href="/search/eess?searchtype=author&query=Yin%2C+C+H">Caitlyn Heqi Yin</a>, <a href="/search/eess?searchtype=author&query=Feng%2C+P">Pohsun Feng</a>, <a href="/search/eess?searchtype=author&query=Wen%2C+Y">Yizhu Wen</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+T">Tianyang Wang</a>, <a href="/search/eess?searchtype=author&query=Li%2C+M">Ming Li</a>, <a href="/search/eess?searchtype=author&query=Ren%2C+J">Jintao Ren</a>, <a href="/search/eess?searchtype=author&query=Niu%2C+Q">Qian Niu</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+S">Silin Chen</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+M">Ming Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.20304v1-abstract-short" style="display: inline;"> Digital Signal Processing (DSP) and Digital Image Processing (DIP) with Machine Learning (ML) and Deep Learning (DL) are popular research areas in Computer Vision and related fields. We highlight transformative applications in image enhancement, filtering techniques, and pattern recognition. 
By integrating frameworks like the Discrete Fourier Transform (DFT), Z-Transform, and Fourier Transform met… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20304v1-abstract-full').style.display = 'inline'; document.getElementById('2410.20304v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.20304v1-abstract-full" style="display: none;"> Digital Signal Processing (DSP) and Digital Image Processing (DIP) with Machine Learning (ML) and Deep Learning (DL) are popular research areas in Computer Vision and related fields. We highlight transformative applications in image enhancement, filtering techniques, and pattern recognition. By integrating frameworks like the Discrete Fourier Transform (DFT), Z-Transform, and Fourier Transform methods, we enable robust data manipulation and feature extraction essential for AI-driven tasks. Using Python, we implement algorithms that optimize real-time data processing, forming a foundation for scalable, high-performance solutions in computer vision. This work illustrates the potential of ML and DL to advance DSP and DIP methodologies, contributing to artificial intelligence, automated feature extraction, and applications across diverse domains. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20304v1-abstract-full').style.display = 'none'; document.getElementById('2410.20304v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">293 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.17799">arXiv:2410.17799</a> <span> [<a href="https://arxiv.org/pdf/2410.17799">pdf</a>, <a href="https://arxiv.org/format/2410.17799">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> OmniFlatten: An End-to-end GPT Model for Seamless Voice Conversation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhang%2C+Q">Qinglin Zhang</a>, <a href="/search/eess?searchtype=author&query=Cheng%2C+L">Luyao Cheng</a>, <a href="/search/eess?searchtype=author&query=Deng%2C+C">Chong Deng</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+Q">Qian Chen</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+W">Wen Wang</a>, <a href="/search/eess?searchtype=author&query=Zheng%2C+S">Siqi Zheng</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+J">Jiaqing Liu</a>, <a href="/search/eess?searchtype=author&query=Yu%2C+H">Hai Yu</a>, <a href="/search/eess?searchtype=author&query=Tan%2C+C">Chaohong Tan</a>, <a 
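As a flavor of the Python-based treatment of the DFT, the snippet below (illustrative, not taken from the work itself) recovers the frequencies of a two-tone signal with NumPy's FFT:

```python
# Minimal DFT example: recover the frequencies of a two-tone signal with
# numpy's FFT (illustrative snippet, not code from the work itself).
import numpy as np

fs = 1000                                   # sampling rate in Hz
t = np.arange(0, 1.0, 1 / fs)
x = np.sin(2 * np.pi * 50 * t) + 0.5 * np.sin(2 * np.pi * 120 * t)

spectrum = np.fft.rfft(x)
freqs = np.fft.rfftfreq(len(x), d=1 / fs)
peaks = freqs[np.argsort(np.abs(spectrum))[-2:]]
print(sorted(peaks))                        # -> [50.0, 120.0]
```
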
href="/search/eess?searchtype=author&query=Du%2C+Z">Zhihao Du</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+S">Shiliang Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.17799v2-abstract-short" style="display: inline;"> Full-duplex spoken dialogue systems significantly surpass traditional turn-based dialogue systems, as they allow simultaneous bidirectional communication, closely mirroring human-human interactions. However, achieving low latency and natural interactions in full-duplex dialogue systems remains a significant challenge, especially considering human conversation dynamics such as interruptions, backch… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.17799v2-abstract-full').style.display = 'inline'; document.getElementById('2410.17799v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.17799v2-abstract-full" style="display: none;"> Full-duplex spoken dialogue systems significantly surpass traditional turn-based dialogue systems, as they allow simultaneous bidirectional communication, closely mirroring human-human interactions. However, achieving low latency and natural interactions in full-duplex dialogue systems remains a significant challenge, especially considering human conversation dynamics such as interruptions, backchannels, and overlapping speech. In this paper, we introduce a novel End-to-End GPT-based model OmniFlatten for full-duplex conversation, capable of effectively modeling the complex behaviors inherent to natural conversations with low latency. To achieve full-duplex conversation capabilities, we propose a multi-stage post-training scheme that progressively adapts a text large language model (LLM) backbone into a speech-text dialogue LLM, capable of generating text and speech in real time, without modifying the architecture of the backbone LLM. The training process comprises three stages: modality alignment, half-duplex dialogue learning, and full-duplex dialogue learning. In all training stages, we standardize the data using a flattening operation, which enables unifying the training methods and the GPT backbone across different modalities and tasks. Our approach offers a simple modeling technique and a promising research direction for developing efficient and natural end-to-end full-duplex spoken dialogue systems. Audio samples of dialogues generated by OmniFlatten can be found at this web site (https://omniflatten.github.io/). <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.17799v2-abstract-full').style.display = 'none'; document.getElementById('2410.17799v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 23 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Work in progress</span> </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Zhang%2C+S&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Zhang%2C+S&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhang%2C+S&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhang%2C+S&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhang%2C+S&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhang%2C+S&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a 
href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>